// Vector-valued noise evaluated with AVX2/FMA3 intrinsics.
//
// Writes three components into result[X], result[Y] and result[Z] for the
// point EPoint. The point is split into an integer lattice cell and a
// fractional offset; eight table entries (one per cell corner) are then
// accumulated with trilinear-style weights built from the smoothed fractions.
//
// When CHECK_FUNCTIONAL is enabled the result is compared against
// PortableDNoise and an exception is thrown if any component differs by
// EPSILON or more.
//
// NOTE(review): PERMUTE4x64, Hash2d, Hash1dRTableIndexAVX and
// INCSUMAVX_VECTOR are defined outside this block; the lane comments below
// assume PERMUTE4x64 behaves like _mm256_permute4x64_pd - confirm.
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint)
{
#if CHECK_FUNCTIONAL
    // Private copy of the input, used only by the verification block at the end.
    Vector3d param(EPoint);
#endif
    AVX2TABLETYPE *mp;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    const __m256d ONE_PD = _mm256_set1_pd(1.0);
    // NOTE(review): not referenced by any visible statement in this function.
    const __m128i short_si128 = _mm_set1_epi32(0xffff);
    // Lanes: {x, y, z, 0}.
    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    // Truncating conversion; lanes whose sign bit is set use the value shifted
    // down by (1 - EPSILON) so that negative coordinates round downwards.
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));
    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);
    // Fractional offset from the cell origin, and the same offset minus one.
    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);
    // Cell index relative to the NOISE_MIN* origin, masked to 12 bits.
    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff));
    // s = f * f * (3 - 2 * f) per lane, t = 1 - s.
    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));
    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);
    // {tx, ty, sx, sy}: low half of t followed by low half of s.
    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    // Presumably {tx, sx, tx, sx} and {ty, ty, sy, sy}; their product holds the
    // four x/y weight combinations, which are then scaled by tz (s1) or sz (s2).
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));
    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);
    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    // Spill the integer cell indices so they can feed the scalar hash helpers.
    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    // One 2D hash per (x, y) corner of the cell; "i" is the base index, "j" is base + 1.
    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    // Lane 0 is forced to 0.5; the remaining lanes carry the x, y, z offsets
    // (iii) or the offsets minus one (jjj), assuming the PERMUTE4x64 semantics
    // noted above.
    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d ss;
    __m256d blend;
    // x, y and z are never assigned explicitly below, yet they are reduced at
    // the end: INCSUMAVX_VECTOR presumably accumulates into them by name, so
    // these identifiers must not be renamed without checking the macro.
    __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd();

    // Eight corner contributions. For each corner: look up the table entry,
    // broadcast the matching weight lane into ss, and build the offset vector
    // by taking lanes from jjj where the corner uses the "+1" index
    // (blend mask bit 1 -> x lane, bit 2 -> y lane, bit 3 -> z lane).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0));
    // blend = _mm256_blend_pd(iii, jjj, 0);
    INCSUMAVX_VECTOR(mp, ss, iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 2);
    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3));
    blend = _mm256_blend_pd(iii, jjj, 6);
    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 4);
    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 12);
    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3));
    // blend = _mm256_blend_pd(iii, jjj, 14);
    INCSUMAVX_VECTOR(mp, ss, jjj);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 10);
    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0));
    blend = _mm256_blend_pd(iii, jjj, 8);
    INCSUMAVX_VECTOR(mp, ss, blend);

    // Horizontal reduction: hadd pairs up the lanes of x and y, then adding the
    // two 128-bit halves leaves {sum(x), sum(y)}, stored with one 128-bit write
    // starting at result[X] (assumes result[Y] directly follows result[X] in
    // memory - TODO confirm against Vector3d).
    __m256d xy = _mm256_hadd_pd(x,y);
    __m128d xy_up = _mm256_extractf128_pd(xy,1);
    xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up);
    _mm_storeu_pd(&result[X],xy_up);

    // Same reduction for z, finished with a 128-bit horizontal add.
    __m128d z_up = _mm256_extractf128_pd(z,1);
    z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up);
    z_up = _mm_hadd_pd(z_up,z_up);
    result[Z] = _mm_cvtsd_f64(z_up);

#if CHECK_FUNCTIONAL
    {
        // Cross-check every component against the portable implementation.
        Vector3d portable_res;
        PortableDNoise(portable_res , param);
        if (fabs(portable_res[X] - result[X]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise X error");
        }
        if (fabs(portable_res[Y] - result[Y]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Y error");
        }
        if (fabs(portable_res[Z] - result[Z]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Z error");
        }
    }
#endif

    // Clear the upper halves of the YMM registers before returning.
    _mm256_zeroupper();
    return;
}
/* Naive implementation of Matrix Matrix Multiplication @param A input matrix @param B input matrix @param C output matrix */ inline void naive(const Matrix& A, const Matrix& B, Matrix& C){ //preload dimensions for faster access int dimM = C.getDimM(); int dimN = C.getDimN(); int dimL = A.getDimN(); for (int m = 0; m < dimM; m+=4){ ///rows of c for (int n = 0; n < dimN; n+=4){ ///cols of c //do calculation of a 4x4 block //std::cout << m << "\t" << n << std::endl; __m256d* pA = A.get(m, 0); __m256d* pB = A.get(m+1, 0); __m256d* pC = A.get(m+2, 0); __m256d* pD = A.get(m+3, 0); __m256d* pK = B.getT(0, n); __m256d* pL = B.getT(0, n+1); __m256d* pM = B.getT(0, n+2); __m256d* pN = B.getT(0, n+3); //std::cout << pA << "\t" << pB << "\t" << pC << "\t" << pD << std::endl; __m256d K = _mm256_setzero_pd(); __m256d L = _mm256_setzero_pd(); __m256d M = _mm256_setzero_pd(); __m256d N = _mm256_setzero_pd(); __m256d O = _mm256_setzero_pd(); __m256d P = _mm256_setzero_pd(); __m256d Q = _mm256_setzero_pd(); __m256d R = _mm256_setzero_pd(); __m256d S = _mm256_setzero_pd(); __m256d T = _mm256_setzero_pd(); __m256d U = _mm256_setzero_pd(); __m256d V = _mm256_setzero_pd(); __m256d W = _mm256_setzero_pd(); __m256d X = _mm256_setzero_pd(); __m256d Y = _mm256_setzero_pd(); __m256d Z = _mm256_setzero_pd(); for (int l = 0; l < dimL; l+=4){ //std::cout <<"mul" << std::endl; K = K + (*pA) * (*pK); L = L + (*pA) * (*pL); M = M + (*pA) * (*pM); N = N + (*pA) * (*pN); O = O + (*pB) * (*pK); P = P + (*pB) * (*pL); Q = Q + (*pB) * (*pM); R = R + (*pB) * (*pN); S = S + (*pC) * (*pK); T = T + (*pC) * (*pL); U = U + (*pC) * (*pM); V = V + (*pC) * (*pN); W = W + (*pD) * (*pK); X = X + (*pD) * (*pL); Y = Y + (*pD) * (*pM); Z = Z + (*pD) * (*pN); //std::cout << "inc" <<std::endl; pA++; pB++; pC++; pD++; pK++; pL++; pM++; pN++; } // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} __m256d sumab = _mm256_hadd_pd(K, L); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} __m256d sumcd = _mm256_hadd_pd(M, N); // 
{a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); __m256d sum = _mm256_add_pd(perm, blend); C.set(m, n, sum); //C(m , n) = K[0] + K[1] + K[2] + K[3]; //C(m , n+1) = L[0] + L[1] + L[2] + L[3]; //C(m , n+2) = M[0] + M[1] + M[2] + M[3]; //C(m , n+3) = N[0] + N[1] + N[2] + N[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(O, P); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(Q, R); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+1, n, sum); //C(m+1, n ) = O[0] + O[1] + O[2] + O[3]; //C(m+1, n+1) = P[0] + P[1] + P[2] + P[3]; //C(m+1, n+2) = Q[0] + Q[1] + Q[2] + Q[3]; //C(m+1, n+3) = R[0] + R[1] + R[2] + R[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(S, T); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(U, V); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+2, n, sum); //C(m+2, n ) = S[0] + S[1] + S[2] + S[3]; //C(m+2, n+1) = T[0] + T[1] + T[2] + T[3]; //C(m+2, n+2) = U[0] + U[1] + U[2] + U[3]; //C(m+2, n+3) = V[0] + V[1] + V[2] + V[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(W, X); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(Y, Z); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+3, n, sum); //C(m+3, n ) 
= W[0] + W[1] + W[2] + W[3]; //C(m+3, n+1) = X[0] + X[1] + X[2] + X[3]; //C(m+3, n+2) = Y[0] + Y[1] + Y[2] + Y[3]; //C(m+3, n+3) = Z[0] + Z[1] + Z[2] + Z[3]; } } }
/**
 * Computes the full per-pixel value in one pass, using the pixel at `center`
 * and its four neighbours (`top`, `left`, `right`, `bottom`), all given as
 * offsets into `im`.
 *
 * The returned value is
 *     |sqrt(mean squared deviation of r,g,b from their mean)|
 *   * |product of exp(minustwelvehalf * (channel - onehalf)^2) over r,g,b|
 *   * |-4 * grey(center) + grey(top) + grey(left) + grey(right) + grey(bottom)|
 *   + 1.0E-12
 * where grey(.) is the lane sum of the pixel multiplied by `rgb0W`.
 *
 * The centre pixel is read with a masked load (`mask1110`); each neighbour is
 * read as four consecutive doubles. The exponential is evaluated in single
 * precision through exp_ps.
 */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110,
        const __m256d rgb0W,
        const __m256d onehalf,
        const __m256d minustwelvehalf){

    // Sum of the four lanes of v, formed as (v0 + v1) + (v2 + v3).
    const auto lane_sum = [](const __m256d v) -> double {
        const __m256d halves_swapped = _mm256_permute2f128_pd(v, v, 0x21);
        const __m256d pair_sums = _mm256_hadd_pd(v, halves_swapped);
        const double low_pair = _mm_cvtsd_f64(_mm256_extractf128_pd(pair_sums, 0));
        const double high_pair = _mm_cvtsd_f64(_mm256_extractf128_pd(pair_sums, 1));
        return low_pair + high_pair;
    };

    // Centre pixel and its four neighbours.
    const __m256d px = _mm256_maskload_pd(&(im[center]), mask1110);
    const __m256d px_top = _mm256_loadu_pd(&(im[top]));
    const __m256d px_left = _mm256_loadu_pd(&(im[left]));
    const __m256d px_right = _mm256_loadu_pd(&(im[right]));
    const __m256d px_bottom = _mm256_loadu_pd(&(im[bottom]));
    COST_INC_LOAD(20);

    // Weighted channels; their lane sums are the grey values.
    const __m256d w_center = _mm256_mul_pd(px, rgb0W);
    const __m256d w_top = _mm256_mul_pd(px_top, rgb0W);
    const __m256d w_left = _mm256_mul_pd(px_left, rgb0W);
    const __m256d w_right = _mm256_mul_pd(px_right, rgb0W);
    const __m256d w_bottom = _mm256_mul_pd(px_bottom, rgb0W);

    const double channel_total = lane_sum(px);
    const double grey_center = lane_sum(w_center);

    // Reduce the four neighbour vectors together: after the blend/permute/add
    // each lane holds the grey value of one neighbour.
    const __m256d top_left = _mm256_hadd_pd(w_top, w_left);
    const __m256d right_bottom = _mm256_hadd_pd(w_right, w_bottom);
    const __m256d same_half = _mm256_blend_pd(top_left, right_bottom, 0xC);
    const __m256d cross_half = _mm256_permute2f128_pd(top_left, right_bottom, 0x21);
    const __m256d neighbour_greys = _mm256_add_pd(cross_half, same_half);
    const double grey_neighbours = lane_sum(neighbour_greys);
    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    // Mean of the centre pixel's channels.
    const double mu = channel_total / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

    // Per-channel deviation from the mean.
    const __m256d mean_v = _mm256_set1_pd(mu);
    const __m256d dev = _mm256_sub_pd(px, mean_v);
    COST_INC_ADD(3); //+1 operations wasted on AVX

    // Per-channel distance from onehalf, squared.
    const __m256d off_half = _mm256_sub_pd(px, onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX
    const __m256d off_half_sq = _mm256_mul_pd(off_half, off_half);
    COST_INC_MUL(3); //+1 operations wasted on AVX

    // exp(minustwelvehalf * off_half_sq), evaluated in single precision.
    const __m256d exponent = _mm256_mul_pd(minustwelvehalf, off_half_sq);
    __m128 exponent_f = _mm256_cvtpd_ps(exponent);
    __m128 gauss_f = exp_ps(exponent_f);
    const __m256d gauss = _mm256_cvtps_pd(gauss_f);
    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

    // sqrt((dr^2 + dg^2 + db^2) / 3); the fourth lane is left out on purpose.
    const __m256d dev_sq = _mm256_mul_pd(dev, dev);
    const __m128d dev_sq_rg = _mm256_extractf128_pd(dev_sq, 0);
    const __m128d dev_sq_b = _mm256_extractf128_pd(dev_sq, 1);
    const __m128d rg_total = _mm_hadd_pd(dev_sq_rg, dev_sq_rg);
    const double b_part = _mm_cvtsd_f64(dev_sq_b);
    const double rg_part = _mm_cvtsd_f64(rg_total);
    const double sq_total = rg_part + b_part;
    const double variance = sq_total / 3.0;
    const double spread = sqrt(variance);
    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);

    const double spread_abs = fabs(spread);
    COST_INC_ABS(1);

    // Product of the three per-channel exponentials.
    const __m128d gauss_rg = _mm256_extractf128_pd(gauss, 0);
    const __m128d gauss_b = _mm256_extractf128_pd(gauss, 1);
    const double e_r = _mm_cvtsd_f64(gauss_rg);
    const double e_b = _mm_cvtsd_f64(gauss_b);
    const __m128d gauss_gr = _mm_permute_pd(gauss_rg, 1); // swap the two lanes
    const double e_g = _mm_cvtsd_f64(gauss_gr);
    const double well = e_r * e_g * e_b;
    COST_INC_MUL(2);

    const double well_abs = fabs(well);
    COST_INC_ABS(1);

    const double spread_well = spread_abs * well_abs;
    COST_INC_MUL(1);

    // -4 * grey(center) plus the four neighbour greys.
    const double center_term = -4.0*grey_center;
    const double contrast = center_term+grey_neighbours;
    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX

    const double contrast_abs = fabs(contrast);
    COST_INC_ABS(1);

    const double product = spread_well * contrast_abs;
    COST_INC_MUL(1);

    const double biased = product + 1.0E-12;
    COST_INC_ADD(1);

    return biased;
}
// Scalar noise at EPoint, evaluated with AVX2/FMA3 intrinsics.
//
// For kNoiseGen_Perlin the value comes from SolidNoise, rescaled and clamped
// to [0, 1]. For every other generator the eight corner contributions of the
// lattice cell containing EPoint are accumulated from AVX2RTable and the sum
// is mapped into [0, 1]: with a shift and scale for kNoiseGen_RangeCorrected,
// with a plain +0.5 offset otherwise.
//
// With CHECK_FUNCTIONAL enabled the result is compared with PortableNoise and
// an exception is thrown if they differ by EPSILON or more.
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator)
{
    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        DBL perlin = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (perlin < 0.0) perlin = 0.0;
        if (perlin > 1.0) perlin = 1.0;

        return perlin;
    }

    AVX2TABLETYPE *mp;
    DBL sum = 0.0;

    const __m256d ones = _mm256_set1_pd(1);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    // Lanes: {x, y, z, 0}.
    const __m256d pos = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);

    // Truncate towards zero, but first shift lanes with the sign bit set down
    // by (1 - EPSILON) so that negative coordinates round downwards.
    const __m256d shift = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d pos_shifted = _mm256_sub_pd(pos, shift);
    const __m128i cell = _mm256_cvttpd_epi32(_mm256_blendv_pd(pos, pos_shifted, pos));

    // Offset inside the cell, and the same offset measured from the far corner.
    const __m256d frac = _mm256_sub_pd(pos, _mm256_cvtepi32_pd(cell));
    const __m256d frac_far = _mm256_sub_pd(frac, ones);

    // Cell index relative to the NOISE_MIN* origin, masked to 12 bits.
    const __m128i origin = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);
    const __m128i cell_index = _mm_and_si128(_mm_sub_epi32(cell, origin), _mm_set1_epi32(0xfff));

    // Interpolation weights: s = f * f * (3 - 2 * f), t = 1 - s.
    const __m256d two_frac = _mm256_add_pd(frac, frac);
    const __m256d three_minus = _mm256_sub_pd(_mm256_set1_pd(3.0), two_frac);
    const __m256d s_w = _mm256_mul_pd(frac, _mm256_mul_pd(frac, three_minus));
    const __m256d t_w = _mm256_sub_pd(ones, s_w);

    // Combine the x and y weights, then scale by the near (t) or far (s) z weight.
    const __m256d xy_pairs = _mm256_permute2f128_pd(t_w, s_w, 0x20);
    const __m256d x_part = PERMUTE4x64(xy_pairs, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d y_part = PERMUTE4x64(xy_pairs, _MM_SHUFFLE(3, 3, 1, 1));
    const __m256d xy_weights = _mm256_mul_pd(x_part, y_part);
    const __m256d w_near = _mm256_mul_pd(xy_weights, PERMUTE4x64(t_w, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d w_far = _mm256_mul_pd(xy_weights, PERMUTE4x64(s_w, _MM_SHUFFLE(2, 2, 2, 2)));

    // Spill the integer cell indices for the scalar hash helpers.
    int idx[4];
    _mm_storeu_si128((__m128i*)(idx), cell_index);

    const int hash_00 = Hash2d(idx[0], idx[1]);          // (ix,     iy)
    const int hash_10 = Hash2d(idx[0] + 1, idx[1]);      // (ix + 1, iy)
    const int hash_01 = Hash2d(idx[0], idx[1] + 1);      // (ix,     iy + 1)
    const int hash_11 = Hash2d(idx[0] + 1, idx[1] + 1);  // (ix + 1, iy + 1)
    const int iz = idx[2];

    // Offset vectors with lane 0 replaced by 0.5.
    const __m256d half_lane0 = _mm256_set_pd(0, 0, 0, 0.5);
    const __m256d off_near = _mm256_blend_pd(PERMUTE4x64(frac, _MM_SHUFFLE(2, 1, 0, 0)), half_lane0, 0x1);
    const __m256d off_far = _mm256_blend_pd(PERMUTE4x64(frac_far, _MM_SHUFFLE(2, 1, 0, 0)), half_lane0, 0x1);

    // Two accumulators, fed alternately by the eight corners.
    __m256d acc_a = _mm256_setzero_pd();
    __m256d acc_b = _mm256_setzero_pd();

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_00, iz)];
    INCSUMAVX_NOBLEND(acc_a, mp, PERMUTE4x64(w_near, _MM_SHUFFLE(0, 0, 0, 0)), off_near);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_10, iz)];
    INCSUMAVX(acc_b, mp, PERMUTE4x64(w_near, _MM_SHUFFLE(1, 1, 1, 1)), off_near, off_far, 2);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_01, iz)];
    INCSUMAVX(acc_a, mp, PERMUTE4x64(w_near, _MM_SHUFFLE(2, 2, 2, 2)), off_near, off_far, 4);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_11, iz)];
    INCSUMAVX(acc_b, mp, PERMUTE4x64(w_near, _MM_SHUFFLE(3, 3, 3, 3)), off_near, off_far, 6);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_00, iz + 1)];
    INCSUMAVX(acc_a, mp, PERMUTE4x64(w_far, _MM_SHUFFLE(0, 0, 0, 0)), off_near, off_far, 8);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_10, iz + 1)];
    INCSUMAVX(acc_b, mp, PERMUTE4x64(w_far, _MM_SHUFFLE(1, 1, 1, 1)), off_near, off_far, 10);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_01, iz + 1)];
    INCSUMAVX(acc_a, mp, PERMUTE4x64(w_far, _MM_SHUFFLE(2, 2, 2, 2)), off_near, off_far, 12);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(hash_11, iz + 1)];
    INCSUMAVX_NOBLEND(acc_b, mp, PERMUTE4x64(w_far, _MM_SHUFFLE(3, 3, 3, 3)), off_far);

    {
        // Merge both accumulators and add up the four lanes.
        acc_a = _mm256_add_pd(acc_a, acc_b);
        __m128d folded = _mm256_extractf128_pd(acc_a,1);
        folded = _mm_add_pd(_mm256_castpd256_pd128(acc_a),folded);
        folded = _mm_hadd_pd(folded,folded);
        sum = _mm_cvtsd_f64(folded);
    }

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        sum += 1.05242;
        sum *= 0.48985582;
    }
    else
    {
        sum = sum + 0.5;                     /* range at this point -0.5 - 0.5... */
    }

    // Both variants finish with the same clamp to [0, 1].
    if (sum < 0.0) sum = 0.0;
    if (sum > 1.0) sum = 1.0;

#if CHECK_FUNCTIONAL
    {
        DBL orig_sum = PortableNoise(EPoint, noise_generator);
        if (fabs(orig_sum - sum) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("Noise error");
        }
    }
#endif

    // Clear the upper halves of the YMM registers before returning.
    _mm256_zeroupper();
    return (sum);
}
// Upper-triangular matrix-vector product for eight rows; it moves horizontally
// inside a block.
//
// Computes t = A * x for an 8 x kmax slice whose leading 8 x 8 part is upper
// triangular, then combines t with y according to alg:
//     alg ==  0 : y  = t
//     alg ==  1 : y += t
//     otherwise : y -= t
//
// Parameters
//   kmax  number of columns; nothing is done when kmax <= 0. The first eight
//         columns are always processed, so callers must pass kmax >= 8.
//   A0    rows 0..3; column j starts at A0[4*j] (four doubles per column).
//   sda   rows 4..7 start at A0 + 4*sda.
//   x     input vector, at least kmax elements.
//   y     output vector, eight elements (accessed unaligned).
//   alg   selects store / add / subtract as listed above.
//
// Every column of A is read with _mm256_load_pd / _mm_load_pd, so A0 and
// A0 + 4*sda must be 32-byte aligned. Entries below the diagonal are never
// used in the result: they are either not loaded or multiplied by zero.
void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg)
{
    if(kmax<=0)
        return;

    double *A1 = A0 + 4*sda;

    const int lda = 4;

    int k;

    __m128d
        tmp0,
        z_0, y_0_1, a_00_10;

    __m256d
        zeros,
        ax_temp,
        a_00_10_20_30, a_01_11_21_31,
        a_40_50_60_70, a_41_51_61_71,
        x_0, x_1,
        y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
        y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;

    zeros = _mm256_setzero_pd();

    // ---- rows 0..3, columns 0..3: upper triangular ----

    // second col (avoid zero y_0_1)
    z_0     = _mm_loaddup_pd( &x[1] );
    a_00_10 = _mm_load_pd( &A0[0+lda*1] );
    y_0_1   = _mm_mul_pd( a_00_10, z_0 );

    // first col
    z_0     = _mm_load_sd( &x[0] );
    a_00_10 = _mm_load_sd( &A0[0+lda*0] );
    tmp0    = _mm_mul_sd( a_00_10, z_0 );
    y_0_1   = _mm_add_sd( y_0_1, tmp0 );
    // The cast leaves the upper 128 bits undefined; take them from the zero
    // vector so that rows 2 and 3 of this accumulator start at exactly 0.
    y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 );
    y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc );

    // fourth col (avoid zero y_0_1_2_3)
    x_1           = _mm256_broadcast_sd( &x[3] );
    a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
    y_0_1_2_3     = _mm256_mul_pd( a_01_11_21_31, x_1 );

    // third col: lane 3 of x is zeroed so the sub-diagonal entry drops out
    x_0           = _mm256_broadcast_sd( &x[2] );
    x_0           = _mm256_blend_pd( x_0, zeros, 0x8 );
    a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
    ax_temp       = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_0_1_2_3_b   = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

    A0 += 4*lda;
    A1 += 4*lda;
    x  += 4;

    // ---- rows 0..3, columns 4..7: full square ----

    x_0 = _mm256_broadcast_sd( &x[0] );
    x_1 = _mm256_broadcast_sd( &x[1] );

    a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
    a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );

    ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
    ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
    y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

    x_0 = _mm256_broadcast_sd( &x[2] );
    x_1 = _mm256_broadcast_sd( &x[3] );

    a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
    a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );

    ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
    ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
    y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

    // ---- rows 4..7, columns 4..7: upper triangular ----

    // second col (avoid zero y_0_1)
    z_0     = _mm_loaddup_pd( &x[1] );
    a_00_10 = _mm_load_pd( &A1[0+lda*1] );
    y_0_1   = _mm_mul_pd( a_00_10, z_0 );

    // first col
    z_0     = _mm_load_sd( &x[0] );
    a_00_10 = _mm_load_sd( &A1[0+lda*0] );
    tmp0    = _mm_mul_sd( a_00_10, z_0 );
    y_0_1   = _mm_add_sd( y_0_1, tmp0 );
    // Same as above: the upper half must come from the zero vector.
    y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 );
    y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc );

    // fourth col (avoid zero y_4_5_6_7)
    x_1           = _mm256_broadcast_sd( &x[3] );
    a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] );
    y_4_5_6_7     = _mm256_mul_pd( a_01_11_21_31, x_1 );

    // third col: lane 3 of x is zeroed so the sub-diagonal entry drops out
    x_0           = _mm256_broadcast_sd( &x[2] );
    x_0           = _mm256_blend_pd( x_0, zeros, 0x8 );
    a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] );
    ax_temp       = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_4_5_6_7_b   = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

    A0 += 4*lda;
    A1 += 4*lda;
    x  += 4;

    // ---- remaining full columns, four at a time ----
    // Even columns go to the plain accumulators, odd columns to the "_b" ones.

    k=8;
    for(; k<kmax-3; k+=4)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );
        x_1 = _mm256_broadcast_sd( &x[1] );

        a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
        a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
        a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

        ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
        ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
        ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
        y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

        x_0 = _mm256_broadcast_sd( &x[2] );
        x_1 = _mm256_broadcast_sd( &x[3] );

        a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
        a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
        a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
        a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

        ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
        ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
        ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
        y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

        A0 += 4*lda;
        A1 += 4*lda;
        x  += 4;
    }

    // ---- two leftover columns ----
    if(kmax%4>=2)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );
        x_1 = _mm256_broadcast_sd( &x[1] );

        a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
        a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
        a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

        ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
        ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
        ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
        y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

        A0 += 2*lda;
        A1 += 2*lda;
        x  += 2;
    }

    // Fold the secondary accumulators into the primary ones.
    y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
    y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

    // ---- one leftover column ----
    if(kmax%2==1)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );

        a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
        a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

        ax_temp   = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp   = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
    }

    // ---- combine with y ----
    if(alg==0)
    {
        _mm256_storeu_pd(&y[0], y_0_1_2_3);
        _mm256_storeu_pd(&y[4], y_4_5_6_7);
    }
    else if(alg==1)
    {
        z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

        z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
        z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

        _mm256_storeu_pd(&y[0], z_0_1_2_3);
        _mm256_storeu_pd(&y[4], z_4_5_6_7);
    }
    else // alg==-1
    {
        z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

        z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
        z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

        _mm256_storeu_pd(&y[0], z_0_1_2_3);
        _mm256_storeu_pd(&y[4], z_4_5_6_7);
    }
}
// Upper-triangular matrix-vector product for one block of four rows; it moves
// horizontally inside the block (A upper triangular).
//
// Forms t = A * x for a 4 x kmax slice whose leading 4 x 4 part is upper
// triangular and combines it with y: alg == 0 stores t, alg == 1 adds t to y,
// any other value subtracts t from y. Column j of A starts at A[4*j] and is
// read with aligned loads; x is read element-wise and y is accessed unaligned.
// Nothing happens for kmax <= 0; the first four columns are always processed.
void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
{
    if (kmax <= 0)
        return;

    const int lda = 4;

    // acc + column * broadcast(*xj), for one full column of four rows.
    const auto add_column = [](const __m256d acc, const double *column, const double *xj) -> __m256d {
        const __m256d x_bcast = _mm256_broadcast_sd(xj);
        const __m256d a_col = _mm256_load_pd(column);
        const __m256d prod = _mm256_mul_pd(a_col, x_bcast);
        return _mm256_add_pd(acc, prod);
    };

    const __m256d zero = _mm256_setzero_pd();

    // Columns 1 and 0 of the triangle only touch rows 0..1, so they are
    // combined in a 128-bit register and widened with a zeroed upper half.
    const __m128d col1 = _mm_mul_pd(_mm_load_pd(&A[0 + lda * 1]), _mm_loaddup_pd(&x[1]));
    const __m128d col0 = _mm_mul_sd(_mm_load_sd(&A[0 + lda * 0]), _mm_load_sd(&x[0]));
    const __m128d rows01 = _mm_add_sd(col1, col0);

    __m256d acc_c = _mm256_blend_pd(_mm256_castpd128_pd256(rows01), zero, 0xc);
    __m256d acc_d = zero;

    // Column 3 of the triangle is a full column.
    __m256d acc_a = _mm256_mul_pd(_mm256_load_pd(&A[0 + lda * 3]), _mm256_broadcast_sd(&x[3]));

    // Column 2: lane 3 of the broadcast is cleared so the sub-diagonal entry
    // is multiplied by zero.
    const __m256d x2_masked = _mm256_blend_pd(_mm256_broadcast_sd(&x[2]), zero, 0x8);
    __m256d acc_b = _mm256_mul_pd(_mm256_load_pd(&A[0 + lda * 2]), x2_masked);

    const double *col = A + 4 * lda;
    const double *xv = x + 4;

    // Full columns, four per iteration, one accumulator per column.
    for (int k = 4; k < kmax - 3; k += 4) {
        acc_a = add_column(acc_a, col + lda * 0, xv + 0);
        acc_b = add_column(acc_b, col + lda * 1, xv + 1);
        acc_c = add_column(acc_c, col + lda * 2, xv + 2);
        acc_d = add_column(acc_d, col + lda * 3, xv + 3);

        col += 4 * lda;
        xv += 4;
    }

    acc_a = _mm256_add_pd(acc_a, acc_c);
    acc_b = _mm256_add_pd(acc_b, acc_d);

    // Two leftover columns.
    if (kmax % 4 >= 2) {
        acc_a = add_column(acc_a, col + lda * 0, xv + 0);
        acc_b = add_column(acc_b, col + lda * 1, xv + 1);

        col += 2 * lda;
        xv += 2;
    }

    acc_a = _mm256_add_pd(acc_a, acc_b);

    // One leftover column.
    if (kmax % 2 == 1) {
        acc_a = add_column(acc_a, col + lda * 0, xv + 0);
    }

    switch (alg) {
    case 0:
        _mm256_storeu_pd(&y[0], acc_a);
        break;
    case 1:
        _mm256_storeu_pd(&y[0], _mm256_add_pd(_mm256_loadu_pd(&y[0]), acc_a));
        break;
    default: // alg==-1
        _mm256_storeu_pd(&y[0], _mm256_sub_pd(_mm256_loadu_pd(&y[0]), acc_a));
        break;
    }
}
// it moves vertically across blocks
//
// Helper: acc[j] += panel column j (4 aligned doubles at cols + 4*j) times xv,
// lane by lane, for j = 0..3.
static inline void dtrmv_u_t_8_mac4(const double *cols, __m256d xv, __m256d *acc)
{
    const int lda = 4;

    acc[0] = _mm256_add_pd(acc[0], _mm256_mul_pd(_mm256_load_pd(cols + lda * 0), xv));
    acc[1] = _mm256_add_pd(acc[1], _mm256_mul_pd(_mm256_load_pd(cols + lda * 1), xv));
    acc[2] = _mm256_add_pd(acc[2], _mm256_mul_pd(_mm256_load_pd(cols + lda * 2), xv));
    acc[3] = _mm256_add_pd(acc[3], _mm256_mul_pd(_mm256_load_pd(cols + lda * 3), xv));
}

// Helper: same as dtrmv_u_t_8_mac4, but column j only keeps rows 0..j; the
// rows below the diagonal are replaced by zero before the multiply.
static inline void dtrmv_u_t_8_mac4_tri(const double *cols, __m256d xv, __m256d *acc)
{
    const int lda = 4;
    const __m256d zero_v = _mm256_setzero_pd();
    const __m256d c0 = _mm256_blend_pd(_mm256_load_pd(cols + lda * 0), zero_v, 0xe);
    const __m256d c1 = _mm256_blend_pd(_mm256_load_pd(cols + lda * 1), zero_v, 0xc);
    const __m256d c2 = _mm256_blend_pd(_mm256_load_pd(cols + lda * 2), zero_v, 0x8);
    const __m256d c3 = _mm256_load_pd(cols + lda * 3);

    acc[0] = _mm256_add_pd(acc[0], _mm256_mul_pd(c0, xv));
    acc[1] = _mm256_add_pd(acc[1], _mm256_mul_pd(c1, xv));
    acc[2] = _mm256_add_pd(acc[2], _mm256_mul_pd(c2, xv));
    acc[3] = _mm256_add_pd(acc[3], _mm256_mul_pd(c3, xv));
}

// Helper: horizontal sums of four accumulators; lane j of the result is the
// sum of the four lanes of acc[j].
static inline __m256d dtrmv_u_t_8_reduce4(const __m256d *acc)
{
    const __m256d h01 = _mm256_hadd_pd(acc[0], acc[1]);
    const __m256d h23 = _mm256_hadd_pd(acc[2], acc[3]);
    const __m256d lo = _mm256_permute2f128_pd(h23, h01, 2);   /* low halves  */
    const __m256d hi = _mm256_permute2f128_pd(h23, h01, 19);  /* high halves */

    return _mm256_add_pd(hi, lo);
}

// Transposed product over 8 columns: t[c] = sum over rows r of A(r,c) * x[r].
//
// A is read in panels of 4 rows; inside a panel column c starts at A[4*c] and
// consecutive panels are 4*sda doubles apart.  All panel loads are 32-byte
// aligned loads; x and y are accessed unaligned.
//
// Rows are consumed as follows:
//   - a dense part, 8 rows per loop trip while k < kmax-7 (a remainder of
//     kmax % 8 rows is NOT processed by this kernel);
//   - one panel whose columns 0..3 are upper triangular and columns 4..7 dense;
//   - one panel where only columns 4..7 are used, as an upper triangle.
//
//   alg : 0 -> y = t,  1 -> y += t,  anything else -> y -= t   (8 elements)
void kernel_dtrmv_u_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
{
    const int lda = 4;
    const int panel_step = 4 + (sda - 1) * lda;
    __m256d acc[8];
    __m256d xv, t_lo, t_hi, y_lo, y_hi;
    int k, j;

    for (j = 0; j < 8; j++)
        acc[j] = _mm256_setzero_pd();

    /* dense rows, two panels per trip */
    for (k = 0; k < kmax - 7; k += 8) {
        xv = _mm256_loadu_pd(&x[0]);
        dtrmv_u_t_8_mac4(A, xv, acc);
        dtrmv_u_t_8_mac4(A + 4 * lda, xv, acc + 4);
        A += panel_step;
        x += 4;

        xv = _mm256_loadu_pd(&x[0]);
        dtrmv_u_t_8_mac4(A, xv, acc);
        dtrmv_u_t_8_mac4(A + 4 * lda, xv, acc + 4);
        A += panel_step;
        x += 4;
    }

    /* top triangle (columns 0..3) and top square (columns 4..7) */
    xv = _mm256_loadu_pd(&x[0]);
    dtrmv_u_t_8_mac4_tri(A, xv, acc);
    dtrmv_u_t_8_mac4(A + 4 * lda, xv, acc + 4);
    A += panel_step;
    x += 4;

    /* bottom triangle (columns 4..7) */
    xv = _mm256_loadu_pd(&x[0]);
    dtrmv_u_t_8_mac4_tri(A + 4 * lda, xv, acc + 4);

    /* store */
    t_lo = dtrmv_u_t_8_reduce4(acc);
    t_hi = dtrmv_u_t_8_reduce4(acc + 4);

    if (alg == 0) {
        _mm256_storeu_pd(&y[0], t_lo);
        _mm256_storeu_pd(&y[4], t_hi);
    } else {
        y_lo = _mm256_loadu_pd(&y[0]);
        y_hi = _mm256_loadu_pd(&y[4]);
        if (alg == 1) {
            y_lo = _mm256_add_pd(y_lo, t_lo);
            y_hi = _mm256_add_pd(y_hi, t_hi);
        } else { /* alg==-1 */
            y_lo = _mm256_sub_pd(y_lo, t_lo);
            y_hi = _mm256_sub_pd(y_hi, t_hi);
        }
        _mm256_storeu_pd(&y[0], y_lo);
        _mm256_storeu_pd(&y[4], y_hi);
    }
}
// it moves vertically across blocks
//
// Helper: acc[j] += panel column j (4 aligned doubles at panel + 4*j) times
// xv, lane by lane, for j = 0..3.
static inline void dtrmv_u_t_4_mac(const double *panel, __m256d xv, __m256d *acc)
{
    const int lda = 4;

    acc[0] = _mm256_add_pd(acc[0], _mm256_mul_pd(_mm256_load_pd(panel + lda * 0), xv));
    acc[1] = _mm256_add_pd(acc[1], _mm256_mul_pd(_mm256_load_pd(panel + lda * 1), xv));
    acc[2] = _mm256_add_pd(acc[2], _mm256_mul_pd(_mm256_load_pd(panel + lda * 2), xv));
    acc[3] = _mm256_add_pd(acc[3], _mm256_mul_pd(_mm256_load_pd(panel + lda * 3), xv));
}

// Transposed product over 4 columns: t[c] = sum over rows r of A(r,c) * x[r].
//
// A is read in panels of 4 rows; inside a panel column c starts at A[4*c] and
// consecutive panels are 4*sda doubles apart.  Panel loads are 32-byte aligned
// loads; x and y are accessed unaligned.
//
// The dense rows are consumed 8 at a time while k < kmax-7, then 4 at a time
// while k < kmax-3 (a remainder of kmax % 4 rows is not processed).  One more
// panel is then applied as an upper triangle: column c keeps rows 0..c only.
//
//   alg : 0 -> y = t,  1 -> y += t,  anything else -> y -= t   (4 elements)
void kernel_dtrmv_u_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
{
    const int lda = 4;
    const int panel_step = 4 + (sda - 1) * lda;
    __m256d acc[4];
    __m256d zero_v, xv, h01, h23, lo, hi, total;
    int k, j;

    for (j = 0; j < 4; j++)
        acc[j] = _mm256_setzero_pd();

    /* dense rows, two panels per trip */
    for (k = 0; k < kmax - 7; k += 8) {
        dtrmv_u_t_4_mac(A, _mm256_loadu_pd(&x[0]), acc);
        A += panel_step;
        x += 4;

        dtrmv_u_t_4_mac(A, _mm256_loadu_pd(&x[0]), acc);
        A += panel_step;
        x += 4;
    }

    /* dense rows, one panel per trip */
    for (; k < kmax - 3; k += 4) {
        dtrmv_u_t_4_mac(A, _mm256_loadu_pd(&x[0]), acc);
        A += panel_step;
        x += 4;
    }

    /* triangular panel: zero the entries below the diagonal before use */
    zero_v = _mm256_setzero_pd();
    xv = _mm256_loadu_pd(&x[0]);
    acc[0] = _mm256_add_pd(acc[0],
        _mm256_mul_pd(_mm256_blend_pd(_mm256_load_pd(A + lda * 0), zero_v, 0xe), xv));
    acc[1] = _mm256_add_pd(acc[1],
        _mm256_mul_pd(_mm256_blend_pd(_mm256_load_pd(A + lda * 1), zero_v, 0xc), xv));
    acc[2] = _mm256_add_pd(acc[2],
        _mm256_mul_pd(_mm256_blend_pd(_mm256_load_pd(A + lda * 2), zero_v, 0x8), xv));
    acc[3] = _mm256_add_pd(acc[3],
        _mm256_mul_pd(_mm256_load_pd(A + lda * 3), xv));

    /* horizontal reduction: lane c of total = sum of the lanes of acc[c] */
    h01 = _mm256_hadd_pd(acc[0], acc[1]);
    h23 = _mm256_hadd_pd(acc[2], acc[3]);
    lo = _mm256_permute2f128_pd(h23, h01, 2);
    hi = _mm256_permute2f128_pd(h23, h01, 19);
    total = _mm256_add_pd(hi, lo);

    switch (alg) {
    case 0:
        _mm256_storeu_pd(&y[0], total);
        break;
    case 1:
        _mm256_storeu_pd(&y[0], _mm256_add_pd(_mm256_loadu_pd(&y[0]), total));
        break;
    default: /* alg==-1 */
        _mm256_storeu_pd(&y[0], _mm256_sub_pd(_mm256_loadu_pd(&y[0]), total));
        break;
    }
}
// it moves vertically across blocks
//
// Helper: applies one 4x4 panel (column j at A + 4*j, 32-byte aligned loads)
// to both products at once:
//   yn     += sum_j A(:,j) * xn[j]      (returned)
//   yt[j]  += A(:,j) .* xt              (lane-wise, reduced by the caller)
// The updates are interleaved column by column.
static inline __m256d dsymv_4_panel_mac(const double *A, __m256d yn, __m256d xt,
                                        const __m256d *xn, __m256d *yt)
{
    const int bs = 4;
    const __m256d a0 = _mm256_load_pd(A + bs * 0);
    const __m256d a1 = _mm256_load_pd(A + bs * 1);
    const __m256d a2 = _mm256_load_pd(A + bs * 2);
    const __m256d a3 = _mm256_load_pd(A + bs * 3);

    yn = _mm256_add_pd(yn, _mm256_mul_pd(a0, xn[0]));
    yt[0] = _mm256_add_pd(yt[0], _mm256_mul_pd(a0, xt));
    yn = _mm256_add_pd(yn, _mm256_mul_pd(a1, xn[1]));
    yt[1] = _mm256_add_pd(yt[1], _mm256_mul_pd(a1, xt));
    yn = _mm256_add_pd(yn, _mm256_mul_pd(a2, xn[2]));
    yt[2] = _mm256_add_pd(yt[2], _mm256_mul_pd(a2, xt));
    yn = _mm256_add_pd(yn, _mm256_mul_pd(a3, xn[3]));
    yt[3] = _mm256_add_pd(yt[3], _mm256_mul_pd(a3, xt));

    return yn;
}

// Helper: one full (unmasked) 4-row step: prefetch the next panel, read
// y_n / x_t, update, write z_n.
static inline void dsymv_4_full_step(const double *A, int sda, const double *y_n,
                                     double *z_n, const double *x_t,
                                     const __m256d *xn, __m256d *yt)
{
    const int bs = 4;
    __m256d yn, xt;

    __builtin_prefetch(A + sda * bs + bs * 0);
    __builtin_prefetch(A + sda * bs + bs * 2);

    yn = _mm256_loadu_pd(&y_n[0]);
    xt = _mm256_loadu_pd(&x_t[0]);
    _mm256_storeu_pd(&z_n[0], dsymv_4_panel_mac(A, yn, xt, xn, yt));
}

// Combined "normal" and "transposed" product of a kmax x 4 strip of A.
//
// A is read in panels of 4 rows (column c of a panel at A[4*c], panels 4*sda
// doubles apart, 32-byte aligned loads).  With s = +1 for alg == 1 and s = -1
// for alg == -1:
//
//   z_n[r] = y_n[r] + s * sum_c A(r,c) * x_n[c]      for r = 0 .. kmax-1
//   z_t[c] = y_t[c] + s * sum_r A(r,c) * x_t[r]      for c = 0 .. 3
//
//   kmax : number of rows; returns immediately when kmax <= 0
//   tri  : when 1, the first panel is a diagonal block: the "normal" product
//          uses its lower triangle including the diagonal (c <= r) and the
//          "transposed" product uses the strictly lower part (r > c).  That
//          path always handles 4 rows.
//   alg  : x_n is negated only for alg == -1, while the z_t update adds for
//          alg == 1 and subtracts for every other value, so only 1 and -1
//          give consistent signs.
//
// Rows are consumed 8 at a time, then 4 at a time; a final group of 1..3 rows
// is handled with lane masks so that x_t, y_n and z_n are never touched past
// element kmax-1.  (y_n used to be read with a full-width load in that tail,
// which read up to three doubles past its end; it now uses the same mask as
// x_t.  The masked-out lanes were never stored, so results are unchanged.)
void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg)
{
    if (kmax <= 0)
        return;

    const int bs = 4;
    /* lane index + 0.5; (lane_mid - k_left) is negative exactly for lanes < k_left */
    static const double lane_mid[4] = {0.5, 1.5, 2.5, 3.5};

    __builtin_prefetch(A + bs * 0);
    __builtin_prefetch(A + bs * 2);

    const int ka = kmax;
    const __m256d zero_v = _mm256_setzero_pd();
    __m256d xn[4], yt[4];
    __m256d h01, h23, lo, hi, t_sum, y_prev;
    int k, j;

    for (j = 0; j < 4; j++) {
        xn[j] = _mm256_broadcast_sd(&x_n[j]);
        if (alg == -1) /* fold the sign into x_n so the loops only ever add */
            xn[j] = _mm256_sub_pd(zero_v, xn[j]);
        yt[j] = _mm256_setzero_pd();
    }

    k = 0;

    /* corner */
    if (tri == 1) {
        __builtin_prefetch(A + sda * bs + bs * 0);
        __builtin_prefetch(A + sda * bs + bs * 2);

        __m256d yn = _mm256_loadu_pd(&y_n[0]);
        const __m256d xt = _mm256_loadu_pd(&x_t[0]);
        const __m256d a0 = _mm256_load_pd(&A[0 + bs * 0]);
        const __m256d a1 = _mm256_load_pd(&A[0 + bs * 1]);
        const __m256d a2 = _mm256_load_pd(&A[0 + bs * 2]);
        const __m256d a3 = _mm256_load_pd(&A[0 + bs * 3]);

        /* blend masks keep: 14 -> rows 1..3, 12 -> rows 2..3, 8 -> row 3 */
        yn = _mm256_add_pd(yn, _mm256_mul_pd(a0, xn[0]));
        yt[0] = _mm256_add_pd(yt[0], _mm256_blend_pd(zero_v, _mm256_mul_pd(a0, xt), 14));
        yn = _mm256_add_pd(yn, _mm256_blend_pd(zero_v, _mm256_mul_pd(a1, xn[1]), 14));
        yt[1] = _mm256_add_pd(yt[1], _mm256_blend_pd(zero_v, _mm256_mul_pd(a1, xt), 12));
        yn = _mm256_add_pd(yn, _mm256_blend_pd(zero_v, _mm256_mul_pd(a2, xn[2]), 12));
        yt[2] = _mm256_add_pd(yt[2], _mm256_blend_pd(zero_v, _mm256_mul_pd(a2, xt), 8));
        yn = _mm256_add_pd(yn, _mm256_blend_pd(zero_v, _mm256_mul_pd(a3, xn[3]), 8));

        _mm256_storeu_pd(&z_n[0], yn);

        A += sda * bs;
        y_n += 4;
        z_n += 4;
        x_t += 4;
        k += 4;
    }

    for (; k < ka - 7; k += 2 * bs) {
        dsymv_4_full_step(A, sda, y_n, z_n, x_t, xn, yt);
        A += sda * bs;
        y_n += 4;
        z_n += 4;
        x_t += 4;

        dsymv_4_full_step(A, sda, y_n, z_n, x_t, xn, yt);
        A += sda * bs;
        y_n += 4;
        z_n += 4;
        x_t += 4;
    }

    for (; k < ka - 3; k += bs) {
        dsymv_4_full_step(A, sda, y_n, z_n, x_t, xn, yt);
        A += sda * bs;
        y_n += 4;
        z_n += 4;
        x_t += 4;
    }

    if (k < ka) {
        /* 1..3 rows left: build a sign-bit mask selecting lanes 0 .. k_left-1 */
        const double k_left = ka - k;
        const __m256i i_mask = _mm256_castpd_si256(
            _mm256_sub_pd(_mm256_loadu_pd(lane_mid), _mm256_broadcast_sd(&k_left)));

        /* masked loads return 0.0 in the unselected lanes and do not touch
           the memory behind them */
        const __m256d yn = _mm256_maskload_pd(&y_n[0], i_mask);
        const __m256d xt = _mm256_maskload_pd(&x_t[0], i_mask);

        _mm256_maskstore_pd(&z_n[0], i_mask, dsymv_4_panel_mac(A, yn, xt, xn, yt));
    }

    /* horizontal reduction: lane c of t_sum = sum of the lanes of yt[c] */
    h01 = _mm256_hadd_pd(yt[0], yt[1]);
    h23 = _mm256_hadd_pd(yt[2], yt[3]);
    lo = _mm256_permute2f128_pd(h23, h01, 2);
    hi = _mm256_permute2f128_pd(h23, h01, 19);
    t_sum = _mm256_add_pd(hi, lo);

    y_prev = _mm256_loadu_pd(&y_t[0]);
    if (alg == 1)
        _mm256_storeu_pd(&z_t[0], _mm256_add_pd(y_prev, t_sum));
    else /* alg==-1 */
        _mm256_storeu_pd(&z_t[0], _mm256_sub_pd(y_prev, t_sum));
}
void rnn_int_d8x4_var2( int k, double *aa, double *a, double *bb, double *b, double *c, aux_t *aux ) { int i; double neg2 = -2.0; double dzero = 0.0; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t c_tmp; v4df_t a03, a47; v4df_t A03, A47; // prefetched A v4df_t b0, b1, b2, b3; v4df_t B0; // prefetched B v4df_t aa_tmp, bb_tmp; int k_iter = k / 2; int k_left = k % 2; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( aux->b_next ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( c ) ); c03_0.v = _mm256_setzero_pd(); c03_1.v = _mm256_setzero_pd(); c03_2.v = _mm256_setzero_pd(); c03_3.v = _mm256_setzero_pd(); c47_0.v = _mm256_setzero_pd(); c47_1.v = _mm256_setzero_pd(); c47_2.v = _mm256_setzero_pd(); c47_3.v = _mm256_setzero_pd(); // Load a03 a03.v = _mm256_load_pd( (double*)a ); // Load a47 a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // Load (b0,b1,b2,b3) b0.v = _mm256_load_pd( (double*)b ); for ( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Preload A03 A03.v = _mm256_load_pd( (double*)( a + 8 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Preload A47 A47.v = _mm256_load_pd( (double*)( a + 12 ) ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); // Preload B0 B0.v = _mm256_load_pd( (double*)( b + 4 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = 
_mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Iteration #1 __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Preload a03 ( next iteration ) a03.v = _mm256_load_pd( (double*)( a + 16 ) ); c_tmp.v = _mm256_mul_pd( A03.v , B0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , B0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c_tmp.v = _mm256_mul_pd( A03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // Preload a47 ( next iteration ) a47.v = _mm256_load_pd( (double*)( a + 20 ) ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( A47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c_tmp.v = _mm256_mul_pd( A03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Load b0 ( next iteration ) b0.v = _mm256_load_pd( (double*)( b + 8 ) ); c_tmp.v = _mm256_mul_pd( A03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( A47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 16; b += 8; } for ( i = 0; i < k_left; ++i ) { a03.v = _mm256_load_pd( (double*)a ); //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] ); a47.v = _mm256_load_pd( (double*)( a + 4 ) ); //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] ); b0.v = _mm256_load_pd( (double*)b ); //printf( "b0 = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( 
a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 8; b += 4; } // Prefetch aa and bb __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) ); tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 ); tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 ); tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 ); tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 ); tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 ); tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 ); tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 ); tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 ); //printf( "rank-k\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], 
c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->I ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->D ) ); //for ( i = 0; i < k; i++ ) { // a03.v = _mm256_load_pd( (double*)a ); // a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // b0.v = _mm256_broadcast_sd( (double*)b ); // b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) ); // b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) ); // b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) ); // a += DKS_MR; // b += DKS_NR; // c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); // c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); // c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); // c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); // c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); // c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); // c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); // c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); // c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); //} aa_tmp.v = _mm256_broadcast_sd( &neg2 ); //c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); //c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); //c03_2.v = 
_mm256_mul_pd( aa_tmp.v, c03_2.v ); //c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); //c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); //c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); //c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); //c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); //printf( "scale -2 \n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); aa_tmp.v = _mm256_load_pd( (double*)aa ); c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v ); //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] ); //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] ); aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) ); c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v ); //printf( "add 
a^2\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); bb_tmp.v = _mm256_broadcast_sd( (double*)bb ); c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v ); c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) ); c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v ); c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) ); c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v ); c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) ); c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v ); c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v ); // Check if there is any illegle value c_tmp.v = _mm256_broadcast_sd( &dzero ); c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v ); // Transpose c03/c47 _0, _1, _2, _3 to be the row vector tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 ); tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF ); tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 ); tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF ); tmpc47_0.v = 
_mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 ); tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF ); tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 ); tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 ); // c03_0; // c03_1; // c03_2; // c03_3; // c47_0; // c47_1; // c47_2; // c47_3; _mm256_store_pd( c , c03_0.v ); _mm256_store_pd( c + 4, c03_1.v ); _mm256_store_pd( c + 8, c03_2.v ); _mm256_store_pd( c + 12, c03_3.v ); _mm256_store_pd( c + 16, c47_0.v ); _mm256_store_pd( c + 20, c47_1.v ); _mm256_store_pd( c + 24, c47_2.v ); _mm256_store_pd( c + 28, c47_3.v ); }
void ks_gaussian_int_d8x4( int k, double alpha, double *u, double *aa, double *a, double *bb, double *b, double *w, aux_t *aux ) { int i; double neg2 = -2.0; double dzero = 0.0; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t c_tmp; v4df_t u03; v4df_t u47; v4df_t a03, a47; v4df_t A03, A47; // prefetched A v4df_t b0, b1, b2, b3; v4df_t B0; // prefetched B v4df_t aa_tmp, bb_tmp; v4df_t w_tmp; //// Inline vdExp() //const double log2e = 1.4426950408889634073599; //const double maxlog = 7.09782712893383996843e2; // log( 2**1024 ) //const double minlog = -7.08396418532264106224e2; // log( 2**-1024 ) //const double one = 1.0; //const double c1 = 6.93145751953125E-1; //const double c2 = 1.42860682030941723212E-6; //// Original Remez Order 11 coefficients //const double w11 = 3.5524625185478232665958141148891055719216674475023e-8; //const double w10 = 2.5535368519306500343384723775435166753084614063349e-7; //const double w9 = 2.77750562801295315877005242757916081614772210463065e-6; //const double w8 = 2.47868893393199945541176652007657202642495832996107e-5; //const double w7 = 1.98419213985637881240770890090795533564573406893163e-4; //const double w6 = 1.3888869684178659239014256260881685824525255547326e-3; //const double w5 = 8.3333337052009872221152811550156335074160546333973e-3; //const double w4 = 4.1666666621080810610346717440523105184720007971655e-2; //const double w3 = 0.166666666669960803484477734308515404418108830469798; //const double w2 = 0.499999999999877094481580370323249951329122224389189; //const double w1 = 1.0000000000000017952745258419615282194236357388884; //const double w0 = 0.99999999999999999566016490920259318691496540598896; // Remez Order 11 polynomail approximation //const double w0 = 9.9999999999999999694541216787022234814339814028865e-1; //const double w1 = 1.0000000000000013347525109964212249781265243645457; //const double w2 
= 4.9999999999990426011279542064313207349934058355357e-1; //const double w3 = 1.6666666666933781279020916199156875162816850273886e-1; //const double w4 = 4.1666666628388978913396218847247771982698350546174e-2; //const double w5 = 8.3333336552944126722390410619859929515740995889372e-3; //const double w6 = 1.3888871805082296012945081624687544823497126781709e-3; //const double w7 = 1.9841863599469418342286677256362193951266072398489e-4; //const double w8 = 2.4787899938611697691690479138150629377630767114546e-5; //const double w9 = 2.7764095757136528235740765949934667970688427190168e-6; //const double w10 = 2.5602485412126369546033948405199058329040797134573e-7; //const double w11 = 3.5347283721656121939634391175390704621351283546671e-8; // Remez Order 9 polynomail approximation // const double w0 = 9.9999999999998657717890998293462356769270934668652e-1; // const double w1 = 1.0000000000041078023971691258305486059867172736079; // const double w2 = 4.9999999979496223000111361187419539211772440139043e-1; // const double w3 = 1.6666667059968250851708016603646727895353772273675e-1; // const double w4 = 4.1666628655740875994884332519499013211594753124142e-2; // const double w5 = 8.3335428149736685441705398632467122758546893330069e-3; // const double w6 = 1.3881912931358424526285652289974115047170651985345e-3; // const double w7 = 1.9983735415194021112767942931416179152416729204150e-4; // const double w8 = 2.3068467290270483679711135625155862511780587976925e-5; // const double w9 = 3.8865682386514872192656192137071689334005518164704e-6; //v4df_t a03_0, a03_1, a03_2, a03_3; //v4df_t a47_0, a47_1, a47_2, a47_3; //v4df_t p03_0, p03_1, p03_2, p03_3; //v4df_t p47_0, p47_1, p47_2, p47_3; //v4df_t y, l2e, tmp, p; //v4li_t k03_0, k03_1, k03_2, k03_3; //v4li_t k47_0, k47_1, k47_2, k47_3; //v4li_t offset; //v4li_t k1, k2; //__m128d p1, p2; int k_iter = k / 2; int k_left = k % 2; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( 
aux->b_next ) ); c03_0.v = _mm256_setzero_pd(); c03_1.v = _mm256_setzero_pd(); c03_2.v = _mm256_setzero_pd(); c03_3.v = _mm256_setzero_pd(); c47_0.v = _mm256_setzero_pd(); c47_1.v = _mm256_setzero_pd(); c47_2.v = _mm256_setzero_pd(); c47_3.v = _mm256_setzero_pd(); // Load a03 a03.v = _mm256_load_pd( (double*)a ); // Load a47 a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // Load (b0,b1,b2,b3) b0.v = _mm256_load_pd( (double*)b ); for ( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Preload A03 A03.v = _mm256_load_pd( (double*)( a + 8 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Preload A47 A47.v = _mm256_load_pd( (double*)( a + 12 ) ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); // Preload B0 B0.v = _mm256_load_pd( (double*)( b + 4 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Iteration #1 __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Preload a03 ( next iteration ) a03.v = _mm256_load_pd( (double*)( a + 16 ) ); c_tmp.v = _mm256_mul_pd( A03.v , B0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , B0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c_tmp.v = 
_mm256_mul_pd( A03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // Preload a47 ( next iteration ) a47.v = _mm256_load_pd( (double*)( a + 20 ) ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( A47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c_tmp.v = _mm256_mul_pd( A03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Load b0 ( next iteration ) b0.v = _mm256_load_pd( (double*)( b + 8 ) ); c_tmp.v = _mm256_mul_pd( A03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( A47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 16; b += 8; } for ( i = 0; i < k_left; ++i ) { a03.v = _mm256_load_pd( (double*)a ); //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] ); a47.v = _mm256_load_pd( (double*)( a + 4 ) ); //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] ); b0.v = _mm256_load_pd( (double*)b ); //printf( "b0 = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , 
b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 8; b += 4; } // Prefetch aa and bb __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) ); tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 ); tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 ); tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 ); tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 ); tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 ); tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 ); tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 ); tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 ); //printf( "rank-k\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < k; i++ ) { // a03.v = 
_mm256_load_pd( (double*)a ); // a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // b0.v = _mm256_broadcast_sd( (double*)b ); // b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) ); // b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) ); // b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) ); // a += DKS_MR; // b += DKS_NR; // c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); // c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); // c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); // c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); // c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); // c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); // c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); // c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); // c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); //} aa_tmp.v = _mm256_broadcast_sd( &neg2 ); //c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); //c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); //c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); //c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); //c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); //c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); //c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); //c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); //printf( "scale -2 \n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", 
c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); aa_tmp.v = _mm256_load_pd( (double*)aa ); c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v ); //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] ); //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] ); aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) ); c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v ); //printf( "add a^2\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // Prefetch u __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); bb_tmp.v = _mm256_broadcast_sd( (double*)bb 
); c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v ); c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) ); c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v ); c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) ); c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v ); c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) ); c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v ); c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v ); // Check if there is any illegle value c_tmp.v = _mm256_broadcast_sd( &dzero ); c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v ); // Scale before the kernel evaluation aa_tmp.v = _mm256_broadcast_sd( &alpha ); c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // Preload u03, u47 u03.v = _mm256_load_pd( (double*)u ); u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // Prefetch w __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); #include "ks_exp_int_d8x4.h" //printf( "square distance\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); 
//printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error Nan: c03_0[ %d ]\n", i ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error Nan: c03_1[ %d ]\n", i ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error Nan: c03_2[ %d ]\n", i ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error Nan: c03_3[ %d ]\n", i ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error Nan: c47_0[ %d ]\n", i ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error Nan: c47_1[ %d ]\n", i ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error Nan: c47_2[ %d ]\n", i ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error Nan: c47_3[ %d ]\n", i ); // } //} // tmp.v = _mm256_broadcast_sd( &maxlog ); // c03_0.v = _mm256_min_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_min_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_min_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_min_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_min_pd( tmp.v, c47_0.v ); // c47_1.v = _mm256_min_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_min_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_min_pd( tmp.v, c47_3.v ); // tmp.v = _mm256_broadcast_sd( &minlog ); // c03_0.v = _mm256_max_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_max_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_max_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_max_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_max_pd( tmp.v, c47_0.v ); // c47_1.v = _mm256_max_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_max_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_max_pd( tmp.v, c47_3.v ); // // // a = c / log2e // // c = a * ln2 = k * ln2 + w, ( w in [ -ln2, ln2 ] ) // l2e.v 
= _mm256_broadcast_sd( &log2e ); // a03_0.v = _mm256_mul_pd( l2e.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( l2e.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( l2e.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( l2e.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( l2e.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( l2e.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( l2e.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( l2e.v, c47_3.v ); // // // Check if a < 0 // tmp.v = _mm256_setzero_pd(); // p03_0.v = _mm256_cmp_pd( a03_0.v, tmp.v, 1 ); // p03_1.v = _mm256_cmp_pd( a03_1.v, tmp.v, 1 ); // p03_2.v = _mm256_cmp_pd( a03_2.v, tmp.v, 1 ); // p03_3.v = _mm256_cmp_pd( a03_3.v, tmp.v, 1 ); // p47_0.v = _mm256_cmp_pd( a47_0.v, tmp.v, 1 ); // p47_1.v = _mm256_cmp_pd( a47_1.v, tmp.v, 1 ); // p47_2.v = _mm256_cmp_pd( a47_2.v, tmp.v, 1 ); // p47_3.v = _mm256_cmp_pd( a47_3.v, tmp.v, 1 ); // tmp.v = _mm256_broadcast_sd( &one ); // p03_0.v = _mm256_and_pd( tmp.v, p03_0.v ); // p03_1.v = _mm256_and_pd( tmp.v, p03_1.v ); // p03_2.v = _mm256_and_pd( tmp.v, p03_2.v ); // p03_3.v = _mm256_and_pd( tmp.v, p03_3.v ); // p47_0.v = _mm256_and_pd( tmp.v, p47_0.v ); // p47_1.v = _mm256_and_pd( tmp.v, p47_1.v ); // p47_2.v = _mm256_and_pd( tmp.v, p47_2.v ); // p47_3.v = _mm256_and_pd( tmp.v, p47_3.v ); // // If a < 0 ( w < 0 ), then a - 1 = ( k - 1 ) + w / ln2 // a03_0.v = _mm256_sub_pd( a03_0.v, p03_0.v ); // a03_1.v = _mm256_sub_pd( a03_1.v, p03_1.v ); // a03_2.v = _mm256_sub_pd( a03_2.v, p03_2.v ); // a03_3.v = _mm256_sub_pd( a03_3.v, p03_3.v ); // a47_0.v = _mm256_sub_pd( a47_0.v, p47_0.v ); // a47_1.v = _mm256_sub_pd( a47_1.v, p47_1.v ); // a47_2.v = _mm256_sub_pd( a47_2.v, p47_2.v ); // a47_3.v = _mm256_sub_pd( a47_3.v, p47_3.v ); // // Compute floor( a ) by two conversions // // if a < 0, p = k - 1 // // else , p = k // k03_0.v = _mm256_cvttpd_epi32( a03_0.v ); // k03_1.v = _mm256_cvttpd_epi32( a03_1.v ); // k03_2.v = _mm256_cvttpd_epi32( a03_2.v ); // k03_3.v = _mm256_cvttpd_epi32( a03_3.v ); // k47_0.v = 
_mm256_cvttpd_epi32( a47_0.v ); // k47_1.v = _mm256_cvttpd_epi32( a47_1.v ); // k47_2.v = _mm256_cvttpd_epi32( a47_2.v ); // k47_3.v = _mm256_cvttpd_epi32( a47_3.v ); // p03_0.v = _mm256_cvtepi32_pd( k03_0.v ); // p03_1.v = _mm256_cvtepi32_pd( k03_1.v ); // p03_2.v = _mm256_cvtepi32_pd( k03_2.v ); // p03_3.v = _mm256_cvtepi32_pd( k03_3.v ); // p47_0.v = _mm256_cvtepi32_pd( k47_0.v ); // p47_1.v = _mm256_cvtepi32_pd( k47_1.v ); // p47_2.v = _mm256_cvtepi32_pd( k47_2.v ); // p47_3.v = _mm256_cvtepi32_pd( k47_3.v ); // // // --------------------- // // x -= p * ln2 // // --------------------- // // c1 = ln2 // // if a < 0, a = ( k - 1 ) * ln2 // // else , a = k * ln2 // // if a < 0, x -= ( k - 1 ) * ln2 // // else , x -= k * ln2 // // // tmp.v = _mm256_broadcast_sd( &c1 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // tmp.v = _mm256_broadcast_sd( &c2 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v 
= _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // // // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // // // // Prefetch u // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); // // // // // Compute e^x using polynomial approximation // // a = w10 + w11 * x // tmp.v = _mm256_broadcast_sd( &w11 ); // //tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( c03_0.v, tmp.v ); // a03_1.v = _mm256_mul_pd( c03_1.v, tmp.v ); // a03_2.v = _mm256_mul_pd( c03_2.v, tmp.v ); // a03_3.v = _mm256_mul_pd( c03_3.v, tmp.v ); // a47_0.v = _mm256_mul_pd( c47_0.v, tmp.v ); // a47_1.v = _mm256_mul_pd( c47_1.v, tmp.v ); // a47_2.v = _mm256_mul_pd( c47_2.v, tmp.v ); // a47_3.v = _mm256_mul_pd( c47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w10 ); // //tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); 
// a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x // tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w7 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( 
a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w6 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w5 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = 
_mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w4 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Prefetch w // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); // // Preload u03 // u03.v = _mm256_load_pd( (double*)u ); // // // tmp.v = _mm256_broadcast_sd( &w3 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w2 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // 
a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w1 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w0 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( 
a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Preload u47 // u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // // // offset.v = _mm_setr_epi32( 1023, 1023, 0, 0 ); // k1.v = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_2.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]); // 
k2.v = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_3.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_2.v = 
_mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_3.v = _mm256_set_m128d( p2, p1 ); // // // //u03.v = _mm256_load_pd( (double*)u ); // //u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // // // c03_0.v = _mm256_mul_pd( a03_0.v, p03_0.v ); // c03_1.v = _mm256_mul_pd( a03_1.v, p03_1.v ); // c03_2.v = _mm256_mul_pd( a03_2.v, p03_2.v ); // c03_3.v = _mm256_mul_pd( a03_3.v, p03_3.v ); // c47_0.v = _mm256_mul_pd( a47_0.v, p47_0.v ); // c47_1.v = _mm256_mul_pd( a47_1.v, p47_1.v ); // c47_2.v = _mm256_mul_pd( a47_2.v, p47_2.v ); // c47_3.v = _mm256_mul_pd( a47_3.v, p47_3.v ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error exp Nan: c03_0[ %d ]\n", i ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error exp Nan: c03_1[ %d ]\n", i ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error exp Nan: c03_2[ %d ]\n", i ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error exp Nan: c03_3[ %d ]\n", i ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error exp Nan: c47_0[ %d ]\n", i ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error exp Nan: c47_1[ %d ]\n", i ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error exp Nan: c47_2[ %d ]\n", i ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error exp Nan: c47_3[ %d ]\n", i ); // } //} //printf( "exp\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], 
c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //printf( "w\n" ); //printf( "%lf, %lf, %lf, %lf\n", w[0], w[3], w[3], w[3] ); //u03.v = _mm256_load_pd( (double*)u ); //u47.v = _mm256_load_pd( (double*)( u + 4 ) ); w_tmp.v = _mm256_broadcast_sd( (double*)w ); c03_0.v = _mm256_mul_pd( w_tmp.v, c03_0.v ); c47_0.v = _mm256_mul_pd( w_tmp.v, c47_0.v ); u03.v = _mm256_add_pd( u03.v, c03_0.v ); u47.v = _mm256_add_pd( u47.v, c47_0.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 1 ) ); c03_1.v = _mm256_mul_pd( w_tmp.v, c03_1.v ); c47_1.v = _mm256_mul_pd( w_tmp.v, c47_1.v ); u03.v = _mm256_add_pd( u03.v, c03_1.v ); u47.v = _mm256_add_pd( u47.v, c47_1.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 2 ) ); c03_2.v = _mm256_mul_pd( w_tmp.v, c03_2.v ); c47_2.v = _mm256_mul_pd( w_tmp.v, c47_2.v ); u03.v = _mm256_add_pd( u03.v, c03_2.v ); u47.v = _mm256_add_pd( u47.v, c47_2.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 3 ) ); c03_3.v = _mm256_mul_pd( w_tmp.v, c03_3.v ); c47_3.v = _mm256_mul_pd( w_tmp.v, c47_3.v ); u03.v = _mm256_add_pd( u03.v, c03_3.v ); u47.v = _mm256_add_pd( u47.v, c47_3.v ); //for ( i = 0; i < 4; i++ ) { // 
if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} _mm256_store_pd( (double*)u, u03.v ); _mm256_store_pd( (double*)( u + 4 ), u47.v ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error gemv Nan: c03_0[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error gemv Nan: c03_1[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error gemv Nan: c03_2[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error gemv Nan: c03_3[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error gemv Nan: c47_0[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error gemv Nan: c47_1[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error gemv Nan: c47_2[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error gemv Nan: c47_3[ %d ]\n", i ); // exit( 1 ); // } //} //for ( i = 0; i < 4; i ++ ) { // if ( w[ i ] != w[ i ] ) { // printf( "GSKS error w Nan: w03[ %d ]\n", i ); // } //} //for ( i = 0; i < 4; i++ ) { // if ( u03.d[ i ] != u03.d[ i ] ) { // printf( "GSKS error u Nan: u03[ %d ]\n", i ); // } // if ( u47.d[ i ] != u47.d[ i ] ) { // printf( "GSKS error u Nan: u47[ %d ]\n", i ); // } //} //printf( "%lf\n", u03.d[0] ); //printf( "%lf\n", u03.d[1] ); //printf( "%lf\n", u03.d[2] ); //printf( "%lf\n", u03.d[3] ); //printf( "%lf\n", u47.d[0] ); //printf( "%lf\n", u47.d[1] ); //printf( "%lf\n", u47.d[2] ); //printf( "%lf\n", u47.d[3] ); }