#include <assert.h>
#include <immintrin.h>
#include <math.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

int main() {
    double rand0 = (double) arc4random();
    if (!__builtin_cpu_supports("avx")) {
        printf("No AVX, skipping test\n");
        return 0;
    }
    __m256d y0 = _mm256_set_pd(rand0, 1, -123, 123);
    __m256d y1 = _mm256_set_pd(-1, rand0, -121, 121);
    __m256d y2 = _mm256_set_pd(1233.0, -0.1, rand0, 1);
    __m256d y3 = _mm256_set_pd(0, 23.0, -1, rand0);
    __m256d result_first = _mm256_sub_pd(y0, y1);
    __m256d result_second = _mm256_sub_pd(y2, y3);

    printf("Gonna give the OS opportunity to trash my registers...\n");
    struct timeval start, now;
    int delta;
    (void)gettimeofday(&start, NULL);
    do {
        (void)gettimeofday(&now, NULL);
        delta = now.tv_sec - start.tv_sec;
        sched_yield();
    } while (delta < 3);

    printf("Here's the values I got\n");
    printf("rand0: %lf\n", rand0);
    double* first = (double*)&result_first;
    printf("first %lf\t %lf\t %lf\t %lf\n", first[3], first[2], first[1], first[0]);
    double* second = (double*)&result_second;
    printf("second %lf\t %lf\t %lf\t %lf\n", second[3], second[2], second[1], second[0]);

    printf("Making sure that calculating them by hand gets the same result\n");
    /* Compare with a tolerance in both directions; the original check lacked
       fabs() and so only caught differences in one direction. */
#define ALMOST_EQ(A,B) assert(fabs((A) - (B)) < 0.1)
    ALMOST_EQ(first[3], rand0 - (-1));
    ALMOST_EQ(first[2], 1 - rand0);
    ALMOST_EQ(first[1], -123 - (-121));
    ALMOST_EQ(first[0], 123 - 121);
    ALMOST_EQ(second[3], 1233.0 - (0));
    ALMOST_EQ(second[2], -0.1 - 23.0);
    ALMOST_EQ(second[1], rand0 - (-1));
    ALMOST_EQ(second[0], 1 - rand0);
    printf("Yep!\n");
    return 0;
}
double compute_pi(size_t dt)
{
    int i;
    double pi = 0.0;
    double delta = 1.0 / dt;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;
    ymm0 = _mm256_set1_pd(1.0);
    ymm1 = _mm256_set1_pd(delta);
    ymm2 = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);
    ymm4 = _mm256_setzero_pd();
    for (i = 0; i <= dt - 4; i += 4) {
        ymm3 = _mm256_set1_pd(i * delta);
        ymm3 = _mm256_add_pd(ymm3, ymm2);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);
        ymm3 = _mm256_add_pd(ymm0, ymm3);
        ymm3 = _mm256_div_pd(ymm1, ymm3);
        ymm4 = _mm256_add_pd(ymm4, ymm3);
    }
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    return pi * 4.0;
}
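// For reference, a minimal scalar sketch (my own addition, not from the original
// sources; assumes <stddef.h>) of the left-endpoint Riemann sum that compute_pi()
// above evaluates four lanes at a time:
// pi ≈ 4 * Σ_{i=0}^{dt-1} delta / (1 + (i*delta)^2) with delta = 1/dt,
// i.e. a discretization of 4 * ∫_0^1 dx/(1+x^2) = pi.
double compute_pi_scalar(size_t dt)
{
    double delta = 1.0 / dt, sum = 0.0;
    for (size_t i = 0; i < dt; ++i) {
        double x = i * delta;
        sum += delta / (1.0 + x * x);
    }
    return sum * 4.0;
}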
double calcPi_simd_rcp(void)
{
    double x;
    int i;
    double width = 1./(double) num_rects;
    __m256d __width = _mm256_set1_pd(width);
    __m256d __half  = _mm256_set1_pd(0.5);
    __m256d __four  = _mm256_set1_pd(4.0);
    __m256d __one   = _mm256_set1_pd(1.0);
    __m256d __sum   = _mm256_set1_pd(0.0);
    __m256d __i     = _mm256_set_pd(0.5, 1.5, 2.5, 3.5);

    for (i = 0; i < num_rects; i += 4) {
        __m256d __x = __i * __width;
        __m256d __y = RCP(__one + __x*__x);
        __sum += __four * __y; // (__one + __x*__x);
        __i = __i + __four;
    }

    double sum;
    sum  = ((double*) &__sum)[0];
    sum += ((double*) &__sum)[1];
    sum += ((double*) &__sum)[2];
    sum += ((double*) &__sum)[3];

    return width*sum;
}
void gvrotg_fma(double *c, double *s, double *r, double a, double b)
{
#if defined(__FMA__)
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;

    if (b == 0.0) {
        *c = 1.0; *s = 0.0; *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0; *s = 1.0; *r = b;
        return;
    }

    // set_pd() order: [3, 2, 1, 0]
    // x[0], x[1]: |a| > |b|, x[2], x[3]: |b| > |a|
    one = _mm256_set1_pd(1.0);
    x0  = _mm256_set_pd(1.0, a, b, 1.0);   // x0 = {1, a, b, 1}
    x1  = _mm256_set_pd(1.0, b, a, 1.0);   // x1 = {1, b, a, 1}
    t0  = _mm256_div_pd(x0, x1);           // t0 = {1, a/b, b/a, 1}
    t2  = _mm256_fmadd_pd(t0, t0, one);    // t2 = {., 1+(a/b)^2, (b/a)^2+1, .}
    u0  = _mm256_sqrt_pd(t2);              // u0 = {., sqrt(1+(a/b)^2), sqrt((b/a)^2+1), .}
    u1  = _mm256_div_pd(one, u0);
    b0  = _mm256_blend_pd(u0, u1, 0x9);    // b0 = {1/u(a), u(a), u(b), 1/u(b)}
    b0  = _mm256_mul_pd(b0, x1);           // b0 = {1/u(a), b*u(a), a*u(b), 1/u(b)}
    b1  = _mm256_mul_pd(t0, u1);           // b1 = {1/u(a), t*u(a), t*u(b), 1/u(b)}

    if (fabs(b) > fabs(a)) {
        *s = b0[3];
        *r = b0[2];
        *c = b1[2];
        if (signbit(b)) {
            *s = -(*s);
            *c = -(*c);
            *r = -(*r);
        }
    } else {
        *c = b0[0];
        *r = b0[1];
        *s = b1[1];
    }
#endif
}
void static
avx_test (void)
{
  int i;
  union256d u, s1, s2;
  double e [4];

  s1.x = _mm256_set_pd (34545, 95567, 23443, 5675);
  s2.x = _mm256_set_pd (674, 57897, 93459, 45624);

  u.x = _mm256_blend_pd (s1.x, s2.x, MASK);

  for (i = 0; i < 4; i++)
    e[i] = (MASK & (0x01 << i)) ? s2.a[i] : s1.a[i];

  if (check_union256d (u, e))
    abort ();
}
void gvrotg_avx(double *c, double *s, double *r, double a, double b)
{
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;

    if (b == 0.0) {
        *c = 1.0; *s = 0.0; *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0; *s = 1.0; *r = b;
        return;
    }

    // set_pd() order: [3, 2, 1, 0]
    // x[0], x[1]: |a| > |b|, x[2], x[3]: |b| > |a|
    x0 = _mm256_set_pd(1.0, a, b, 1.0);    // x0 = {1, a, b, 1}
    x1 = _mm256_set_pd(1.0, b, a, 1.0);    // x1 = {1, b, a, 1}
    t0 = _mm256_div_pd(x0, x1);            // t0 = {1, a/b, b/a, 1}
    x0 = _mm256_mul_pd(t0, t0);            // x0 = {1, (a/b)^2, (b/a)^2, 1}
    t2 = _mm256_hadd_pd(x0, x0);           // t2 = {1+(a/b)^2, ., (b/a)^2+1, ..}
    u0 = _mm256_sqrt_pd(t2);               // u0 = {sqrt(1+(a/b)^2), .., sqrt((b/a)^2+1), ..}
    one = _mm256_set1_pd(1.0);
    u1 = _mm256_div_pd(one, u0);
    b0 = _mm256_blend_pd(u0, u1, 0x9);     // b0 = {1/u(b), u(b), u(a), 1/u(a)}
    b0 = _mm256_mul_pd(b0, x1);            // b0 = {1/u(b), b*u(b), a*u(a), 1/u(a)}
    b1 = _mm256_mul_pd(t0, u1);            // b1 = {1/u(b), t*u(b), t*u(a), 1/u(a)}

    if (fabs(b) > fabs(a)) {
        *s = b0[3];   // = 1/u(b)
        *r = b0[2];   // = b*u(b)
        *c = b1[2];   // = t*u(b)
        if (signbit(b)) {
            *s = -(*s);
            *c = -(*c);
            *r = -(*r);
        }
    } else {
        *c = b0[0];
        *r = b0[1];
        *s = b1[1];
    }
}
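// For comparison, a scalar sketch of the Givens rotation that gvrotg_fma() and
// gvrotg_avx() above vectorize (my own addition; assumes <math.h>, and the sign
// convention may differ slightly from the AVX variants): find c, s, r with
// c*a + s*b = r and -s*a + c*b = 0, branching on |a| vs |b| so the squared ratio
// cannot overflow.
void gvrotg_scalar(double *c, double *s, double *r, double a, double b)
{
    if (b == 0.0) { *c = 1.0; *s = 0.0; *r = a; return; }
    if (a == 0.0) { *c = 0.0; *s = 1.0; *r = b; return; }
    if (fabs(b) > fabs(a)) {
        double t = a / b, u = sqrt(1.0 + t * t);
        if (signbit(b)) u = -u;
        *s = 1.0 / u;          /* s = 1/sqrt(1+(a/b)^2), carrying the sign of b */
        *c = (*s) * t;
        *r = b * u;            /* r = hypot(a, b) */
    } else {
        double t = b / a, u = sqrt(1.0 + t * t);
        *c = 1.0 / u;
        *s = (*c) * t;
        *r = a * u;
    }
}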
double compute_pi_leibniz_avx_opt(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
    register __m256d ymm9, ymm10, ymm11, ymm12, ymm13;

    ymm0 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
    ymm1 = _mm256_set_pd(1.0, 3.0, 5.0, 7.0);
    ymm2 = _mm256_set_pd(9.0, 11.0, 13.0, 15.0);
    ymm3 = _mm256_set_pd(17.0, 19.0, 21.0, 23.0);
    ymm4 = _mm256_set_pd(25.0, 27.0, 29.0, 31.0);
    ymm13 = _mm256_set1_pd(32.0);
    ymm5 = _mm256_setzero_pd();
    ymm6 = _mm256_setzero_pd();
    ymm7 = _mm256_setzero_pd();
    ymm8 = _mm256_setzero_pd();

    for (int i = 0; i <= n - 16; i += 16) {
        ymm9 = _mm256_div_pd(ymm0, ymm1);
        ymm1 = _mm256_add_pd(ymm1, ymm13);
        ymm10 = _mm256_div_pd(ymm0, ymm2);
        ymm2 = _mm256_add_pd(ymm2, ymm13);
        ymm11 = _mm256_div_pd(ymm0, ymm3);
        ymm3 = _mm256_add_pd(ymm3, ymm13);
        ymm12 = _mm256_div_pd(ymm0, ymm4);
        ymm4 = _mm256_add_pd(ymm4, ymm13);

        ymm5 = _mm256_add_pd(ymm5, ymm9);
        ymm6 = _mm256_add_pd(ymm6, ymm10);
        ymm7 = _mm256_add_pd(ymm7, ymm11);
        ymm8 = _mm256_add_pd(ymm8, ymm12);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm5);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm6);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm7);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm8);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
static void initializeRecurrence(double c, __m256d &factor_prev, __m256d &factor_cur)
{
    /* How to generate:
       c[0] = 1; c[1] = c;
       c[n_] := Expand[2 c c[n - 1] - c[n - 2]]
       last = 2 c;
       For[i = 3, i <= 9, ++i,
         last = Expand[(c[i] + last)/c];
         Print[last]
       ]
    */
    double c2 = c*c,
           temp1 = 2.0*c,
           temp2 = -1.0 + 4.0*c2,
           temp3 = c*(-4.0 + 8.0*c2),
           temp4 = 1.0 + c2*(-12.0 + 16.0*c2);
    factor_prev = _mm256_set_pd(-temp3, -temp2, -temp1, -1.0);
    factor_cur  = _mm256_set_pd( temp4,  temp3,  temp2,  temp1);
}
void static
avx_test (void)
{
  union256d u, s1;
  double e [4] = {0x1.d3881b2c32ed7p+7, 0x1.54abaed51711cp+4,
                  0x1.19195c08a8d23p+5, 0x1.719741d6c0b0bp+5};

  s1.x = _mm256_set_pd (2134.3343, 1234.635654, 453.345635, 54646.464356);

  u.x = _mm256_sqrt_pd (s1.x);

  if (check_union256d (u, e))
    abort ();
}
double compute_pi_leibniz_fma(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(2.0);
    ymm2 = _mm256_set1_pd(1.0);
    ymm3 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);

    for (int i = 0; i <= n - 4; i += 4) {
        ymm4 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
        ymm4 = _mm256_fmadd_pd(ymm1, ymm4, ymm2);
        ymm4 = _mm256_div_pd(ymm3, ymm4);
        ymm0 = _mm256_add_pd(ymm0, ymm4);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm0);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
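// Both Leibniz variants above accumulate the alternating series
// pi/4 = 1 - 1/3 + 1/5 - 1/7 + ...; a plain scalar sketch for cross-checking
// (my own addition, not from the original sources):
double compute_pi_leibniz_scalar(size_t n)
{
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i)
        sum += (i % 2 == 0 ? 1.0 : -1.0) / (2.0 * (double)i + 1.0);
    return sum * 4.0;
}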
void static
avx_test (void)
{
  union256d u;
  double e [4] __attribute__ ((aligned (32))) = {0.0};

  u.x = _mm256_set_pd (39578.467285, 7856.342941, 85632.783567, 47563.234215);

  test (e, u.x);

  if (check_union256d (u, e))
    abort ();
}
void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
  __m256d YMM0, YMM1, YMM2, YMM3;
  for (i=0; i<=((n)-4); i+=4) {
    YMM0 = _mm256_loadu_pd(y+i);
    YMM1 = _mm256_loadu_pd(x+i);
    YMM2 = _mm256_mul_pd(YMM0, YMM15);
    YMM3 = _mm256_add_pd(YMM1, YMM2);
    _mm256_storeu_pd(z+i, YMM3);
  }
  for (; i<(n); i++) {
    z[i] = x[i] + y[i] * c;
  }
}
void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) {
  ptrdiff_t i;
  ptrdiff_t off;
  __m256d YMM0 = _mm256_set_pd(c, c, c, c);
  for (i=0; i<=((n)-16); i+=16) {
    _mm256_storeu_pd((x)+i   , YMM0);
    _mm256_storeu_pd((x)+i+4 , YMM0);
    _mm256_storeu_pd((x)+i+8 , YMM0);
    _mm256_storeu_pd((x)+i+12, YMM0);
  }
  off = (n) - ((n)%16);
  for (i=0; i<((n)%16); i++) {
    x[off+i] = c;
  }
}
void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
  __m256d YMM0, YMM1;
  for (i=0; i<=((n)-8); i+=8) {
    YMM0 = _mm256_loadu_pd(x+i);
    YMM1 = _mm256_loadu_pd(x+i+4);
    YMM0 = _mm256_mul_pd(YMM0, YMM15);
    YMM1 = _mm256_mul_pd(YMM1, YMM15);
    _mm256_storeu_pd(y+i, YMM0);
    _mm256_storeu_pd(y+i+4, YMM1);
  }
  for (; i<n; i++) {
    y[i] = x[i] * c;
  }
}
void static
avx_test (void)
{
  int i;
  union256d s1;
  union128i_d u;
  int e [4];

  s1.x = _mm256_set_pd (2.78, 7777768.82, 23.67, 536.46);

  u.x = _mm256_cvtpd_epi32 (s1.x);

  for (i = 0; i < 4; i++)
    e[i] = (int)(s1.a[i] + 0.5);

  if (check_union128i_d (u, e))
    abort ();
}
double compute_pi_euler_avx(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3;
    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(1.0);
    ymm2 = _mm256_set1_pd(6.0);
    /* Sum 1/k^2 for k = 1..n; starting at k = 0 would inject a 1/0 = inf term. */
    for (int i = 1; i + 3 <= n; i += 4) {
        ymm3 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);
        ymm3 = _mm256_div_pd(ymm1, ymm3);
        ymm0 = _mm256_add_pd(ymm0, ymm3);
    }
    ymm3 = _mm256_mul_pd(ymm2, ymm0);   /* 6 * sum */
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm3);         /* store the scaled sum, not the raw ymm0 */
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    return sqrt( pi );
}
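// The Euler variant relies on the Basel identity Σ_{k>=1} 1/k^2 = pi^2/6, hence
// scaling the partial sum by 6 before taking the square root. A scalar sketch for
// cross-checking (my own addition; assumes <math.h>):
double compute_pi_euler_scalar(size_t n)
{
    double sum = 0.0;
    for (size_t k = 1; k <= n; ++k)
        sum += 1.0 / ((double)k * (double)k);
    return sqrt(6.0 * sum);
}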
void ntt_transform(poly out, const poly o) { int s, pos = 0, offset; __m256d vt,vo0,vo10,vo11,vo20,vo21,vo22,vo23,vc,vp,vpinv,neg2,neg4; __m256d vx0,vx1,vx2,vx3,vx4,vx5,vx6,vx7; vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE); vp = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.); bitrev(out); vo10 = _mm256_load_pd(o+pos); vo20 = _mm256_load_pd(o+pos+4); neg2 = _mm256_load_pd(_neg2); neg4 = _mm256_load_pd(_neg4); // m = 2, m = 4, m = 8 (3 levels merged) for(s = 0; s<POLY_DEG; s+=8) { // No multiplication with omega required, respective value is 1 vx0 = _mm256_load_pd(out+s); vt = _mm256_mul_pd(vx0,neg2); vx0 = _mm256_hadd_pd(vx0,vt); vx1 = _mm256_load_pd(out+s+4); vt = _mm256_mul_pd(vx1,neg2); vx1 = _mm256_hadd_pd(vx1,vt); vx0 = _mm256_mul_pd(vx0, vo10); vc = _mm256_mul_pd(vx0, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vx0 = _mm256_sub_pd(vx0,vc); vt = _mm256_permute2f128_pd (vx0, vx0, 0x01); // now contains x2,x3,x0,x1 vx0 = _mm256_mul_pd(vx0, neg4); vx0 = _mm256_add_pd(vx0, vt); vx1 = _mm256_mul_pd(vx1, vo10); vc = _mm256_mul_pd(vx1, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vx1 = _mm256_sub_pd(vx1,vc); vt = _mm256_permute2f128_pd (vx1, vx1, 0x01); // now contains x2,x3,x0,x1 vx1 = _mm256_mul_pd(vx1, neg4); vx1 = _mm256_add_pd(vx1, vt); vt = _mm256_mul_pd(vx1, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx1 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+s+4, vx1); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+s+0, vx0); } pos += 8; // m = 16, m = 32, m = 64 (3 levels merged) for(offset = 0; offset < 8; offset+=4) { vo0 = _mm256_load_pd(o+pos+offset); vo10 = _mm256_load_pd(o+pos+offset+8); vo11 = _mm256_load_pd(o+pos+offset+16); for(s = 0; s<POLY_DEG; s+=64) { vx1 = _mm256_load_pd(out+offset+s+8); vt = _mm256_mul_pd(vx1, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx0 = _mm256_load_pd(out+offset+s+0); vx1 = _mm256_sub_pd(vx0, vt); // _mm256_store_pd(out+offset+s+8, vx1); vx0 = _mm256_add_pd(vx0, vt); // _mm256_store_pd(out+offset+s+0, vx0); vx3 = _mm256_load_pd(out+offset+s+24); vt = _mm256_mul_pd(vx3, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx2 = _mm256_load_pd(out+offset+s+16); vx3 = _mm256_sub_pd(vx2, vt); // _mm256_store_pd(out+offset+s+24, vx3); vx2 = _mm256_add_pd(vx2, vt); // _mm256_store_pd(out+offset+s+16, vx2); vx5 = _mm256_load_pd(out+offset+s+40); vt = _mm256_mul_pd(vx5, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx4 = _mm256_load_pd(out+offset+s+32); vx5 = _mm256_sub_pd(vx4, vt); // _mm256_store_pd(out+offset+s+40, vx5); vx4 = _mm256_add_pd(vx4, vt); // _mm256_store_pd(out+offset+s+32, vx4); vx7 = _mm256_load_pd(out+offset+s+56); vt = _mm256_mul_pd(vx7, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx6 = _mm256_load_pd(out+offset+s+48); vx7 = _mm256_sub_pd(vx6, vt); // _mm256_store_pd(out+offset+s+56, vx7); vx6 = _mm256_add_pd(vx6, vt); // _mm256_store_pd(out+offset+s+48, vx6); // vx2 = _mm256_load_pd(out+offset+s+16); vt = _mm256_mul_pd(vx2, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = 
_mm256_sub_pd(vt,vc); // vx0 = _mm256_load_pd(out+offset+s+0); vx2 = _mm256_sub_pd(vx0, vt); // _mm256_store_pd(out+offset+s+16, vx2); vx0 = _mm256_add_pd(vx0, vt); // _mm256_store_pd(out+offset+s+0, vx0); // vx6 = _mm256_load_pd(out+offset+s+48); vt = _mm256_mul_pd(vx6, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx4 = _mm256_load_pd(out+offset+s+32); vx6 = _mm256_sub_pd(vx4, vt); // _mm256_store_pd(out+offset+s+48, vx6); vx4 = _mm256_add_pd(vx4, vt); // _mm256_store_pd(out+offset+s+32, vx4); // vx3 = _mm256_load_pd(out+offset+s+24); vt = _mm256_mul_pd(vx3, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx1 = _mm256_load_pd(out+offset+s+8); vx3 = _mm256_sub_pd(vx1, vt); // _mm256_store_pd(out+offset+s+24, vx3); vx1 = _mm256_add_pd(vx1, vt); // _mm256_store_pd(out+offset+s+8, vx1); // vx7 = _mm256_load_pd(out+offset+s+56); vt = _mm256_mul_pd(vx7, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx5 = _mm256_load_pd(out+offset+s+40); vx7 = _mm256_sub_pd(vx5, vt); // _mm256_store_pd(out+offset+s+56, vx7); vx5 = _mm256_add_pd(vx5, vt); // _mm256_store_pd(out+offset+s+40, vx5); // vx4 = _mm256_load_pd(out+offset+s+32); vo20 = _mm256_load_pd(o+pos+offset+24); vt = _mm256_mul_pd(vx4, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx0 = _mm256_load_pd(out+offset+s+0); vx4 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+offset+s+32, vx4); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+offset+s+0, vx0); // vx5 = _mm256_load_pd(out+offset+s+40); vo21 = _mm256_load_pd(o+pos+offset+32); vt = _mm256_mul_pd(vx5, vo21); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx1 = _mm256_load_pd(out+offset+s+8); vx5 = _mm256_sub_pd(vx1, vt); _mm256_store_pd(out+offset+s+40, vx5); vx1 = _mm256_add_pd(vx1, vt); _mm256_store_pd(out+offset+s+8, vx1); // vx6 = _mm256_load_pd(out+offset+s+48); vo22 = _mm256_load_pd(o+pos+offset+40); vt = _mm256_mul_pd(vx6, vo22); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx2 = _mm256_load_pd(out+offset+s+16); vx6 = _mm256_sub_pd(vx2, vt); _mm256_store_pd(out+offset+s+48, vx6); vx2 = _mm256_add_pd(vx2, vt); _mm256_store_pd(out+offset+s+16, vx2); // vx7 = _mm256_load_pd(out+offset+s+56); vo23 = _mm256_load_pd(o+pos+offset+48); vt = _mm256_mul_pd(vx7, vo23); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx3 = _mm256_load_pd(out+offset+s+24); vx7 = _mm256_sub_pd(vx3, vt); _mm256_store_pd(out+offset+s+56, vx7); vx3 = _mm256_add_pd(vx3, vt); _mm256_store_pd(out+offset+s+24, vx3); } } pos += 56; // m = 128, m=256, m=512 (3 levels merged) for(offset=0;offset<64;offset+=4) { vo0 = _mm256_load_pd(o+pos+offset); vo10 = _mm256_load_pd(o+pos+offset+64); vo11 = _mm256_load_pd(o+pos+offset+128); for(s = 0; s<POLY_DEG; s+=512) { vx1 = _mm256_load_pd(out+offset+s+64); vt = _mm256_mul_pd(vx1, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx0 = _mm256_load_pd(out+offset+s+0); vx1 = _mm256_sub_pd(vx0, vt); //_mm256_store_pd(out+offset+s+64, vx1); vx0 = _mm256_add_pd(vx0, vt); 
//_mm256_store_pd(out+offset+s+0, vx0); vx3 = _mm256_load_pd(out+offset+s+192); vt = _mm256_mul_pd(vx3, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx2 = _mm256_load_pd(out+offset+s+128); vx3 = _mm256_sub_pd(vx2, vt); //_mm256_store_pd(out+offset+s+192, vx3); vx2 = _mm256_add_pd(vx2, vt); //_mm256_store_pd(out+offset+s+128, vx2); vx5 = _mm256_load_pd(out+offset+s+320); vt = _mm256_mul_pd(vx5, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx4 = _mm256_load_pd(out+offset+s+256); vx5 = _mm256_sub_pd(vx4, vt); //_mm256_store_pd(out+offset+s+320, vx5); vx4 = _mm256_add_pd(vx4, vt); //_mm256_store_pd(out+offset+s+256, vx4); vx7 = _mm256_load_pd(out+offset+s+448); vt = _mm256_mul_pd(vx7, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx6 = _mm256_load_pd(out+offset+s+384); vx7 = _mm256_sub_pd(vx6, vt); //_mm256_store_pd(out+offset+s+448, vx7); vx6 = _mm256_add_pd(vx6, vt); //_mm256_store_pd(out+offset+s+384, vx6); //vx2 = _mm256_load_pd(out+offset+s+128); vt = _mm256_mul_pd(vx2, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx0 = _mm256_load_pd(out+offset+s+0); vx2 = _mm256_sub_pd(vx0, vt); //_mm256_store_pd(out+offset+s+128, vx2); vx0 = _mm256_add_pd(vx0, vt); //_mm256_store_pd(out+offset+s+0, vx0); //vx3 = _mm256_load_pd(out+offset+s+192); vt = _mm256_mul_pd(vx3, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx1 = _mm256_load_pd(out+offset+s+64); vx3 = _mm256_sub_pd(vx1, vt); //_mm256_store_pd(out+offset+s+192, vx3); vx1 = _mm256_add_pd(vx1, vt); //_mm256_store_pd(out+offset+s+64, vx1); //vx6 = _mm256_load_pd(out+offset+s+384); vt = _mm256_mul_pd(vx6, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx4 = _mm256_load_pd(out+offset+s+256); vx6 = _mm256_sub_pd(vx4, vt); //_mm256_store_pd(out+offset+s+384, vx6); vx4 = _mm256_add_pd(vx4, vt); //_mm256_store_pd(out+offset+s+256, vx4); //vx7 = _mm256_load_pd(out+offset+s+448); vt = _mm256_mul_pd(vx7, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx5 = _mm256_load_pd(out+offset+s+320); vx7 = _mm256_sub_pd(vx5, vt); //_mm256_store_pd(out+offset+s+448, vx7); vx5 = _mm256_add_pd(vx5, vt); //_mm256_store_pd(out+offset+s+320, vx5); //vx4 = _mm256_load_pd(out+offset+s+256); vo20 = _mm256_load_pd(o+pos+offset+192); vt = _mm256_mul_pd(vx4, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx0 = _mm256_load_pd(out+offset+s+0); vx4 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+offset+s+256, vx4); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+offset+s+0, vx0); //vx5 = _mm256_load_pd(out+offset+s+320); vo21 = _mm256_load_pd(o+pos+offset+256); vt = _mm256_mul_pd(vx5, vo21); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx1 = _mm256_load_pd(out+offset+s+64); vx5 = _mm256_sub_pd(vx1, vt); _mm256_store_pd(out+offset+s+320, vx5); vx1 = _mm256_add_pd(vx1, vt); _mm256_store_pd(out+offset+s+64, vx1); //vx6 = _mm256_load_pd(out+offset+s+384); vo22 = _mm256_load_pd(o+pos+offset+320); 
vt = _mm256_mul_pd(vx6, vo22); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx2 = _mm256_load_pd(out+offset+s+128); vx6 = _mm256_sub_pd(vx2, vt); _mm256_store_pd(out+offset+s+384, vx6); vx2 = _mm256_add_pd(vx2, vt); _mm256_store_pd(out+offset+s+128, vx2); //vx7 = _mm256_load_pd(out+offset+s+448); vo23 = _mm256_load_pd(o+pos+offset+384); vt = _mm256_mul_pd(vx7, vo23); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx3 = _mm256_load_pd(out+offset+s+192); vx7 = _mm256_sub_pd(vx3, vt); _mm256_store_pd(out+offset+s+448, vx7); vx3 = _mm256_add_pd(vx3, vt); _mm256_store_pd(out+offset+s+192, vx3); } } }
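// A small helper sketch (my own addition; the name reduce_mod_p is hypothetical,
// and <immintrin.h> is assumed) of the reduction step repeated throughout
// ntt_transform() above: subtract round(t * (1/p)) * p from a product t so the
// coefficients stay near a representative of t mod p, with vp = p = 8383489 and
// vpinv ≈ 1/p.
static inline __m256d reduce_mod_p(__m256d t, __m256d vp, __m256d vpinv)
{
    __m256d c = _mm256_mul_pd(t, vpinv);            /* t * (1/p), approximately */
    c = _mm256_round_pd(c, 0x08);                   /* round to nearest integer */
    return _mm256_sub_pd(t, _mm256_mul_pd(c, vp));  /* t - round(t/p) * p       */
}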
__m256d
foo (double x1, double x2, double x3, double x4)
{
  return _mm256_set_pd (x1, x2, x3, x4);
}
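// One detail worth keeping in mind for the snippets above: _mm256_set_pd() lists
// its arguments from the highest lane (index 3) down to the lowest (index 0),
// while _mm256_setr_pd() takes them in memory order. A standalone sketch (my own
// addition) that makes the ordering visible:
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
    double out[4] __attribute__((aligned(32)));
    _mm256_store_pd(out, v);   /* out[0]=0.0, out[1]=1.0, out[2]=2.0, out[3]=3.0 */
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}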
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator) { AVX2TABLETYPE *mp; DBL sum = 0.0; // TODO FIXME - global statistics reference // Stats[Calls_To_Noise]++; if (noise_generator == kNoiseGen_Perlin) { // The 1.59 and 0.985 are to correct for some biasing problems with // the random # generator used to create the noise tables. Final // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean // value is 0.49 (ideally it would be 0.5). sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985); // Clamp final value to 0-1 range if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; return sum; } const __m256d ONE_PD = _mm256_set1_pd(1); const __m128i short_si128 = _mm_set1_epi32(0xffff); const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0); const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON); const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy); const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn)); const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0); const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn)); const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD); const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff)); const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn)))); const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn); const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20); const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0)); const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1)); const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy); const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); int ints[4]; _mm_storeu_si128((__m128i*)(ints), i_xyzn); const int ixiy_hash = Hash2d(ints[0], ints[1]); const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]); const int ixjy_hash = Hash2d(ints[0], ints[1] + 1); const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1); const int iz = ints[2]; const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); __m256d sumr = _mm256_setzero_pd(); __m256d sumr1 = _mm256_setzero_pd(); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)]; INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, 
_MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)]; INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj); { sumr = _mm256_add_pd(sumr, sumr1); __m128d sumr_up = _mm256_extractf128_pd(sumr,1); sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up); sumr_up = _mm_hadd_pd(sumr_up,sumr_up); sum = _mm_cvtsd_f64(sumr_up); } if (noise_generator == kNoiseGen_RangeCorrected) { /* details of range here: Min, max: -1.05242, 0.988997 Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828 We want to change it to as close to [0,1] as possible. */ sum += 1.05242; sum *= 0.48985582; /*sum *= 0.5; sum += 0.5;*/ if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; } else { sum = sum + 0.5; /* range at this point -0.5 - 0.5... */ if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; } #if CHECK_FUNCTIONAL { DBL orig_sum = PortableNoise(EPoint, noise_generator); if (fabs(orig_sum - sum) >= EPSILON) { throw POV_EXCEPTION_STRING("Noise error"); } } #endif _mm256_zeroupper(); return (sum); }
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint) { #if CHECK_FUNCTIONAL Vector3d param(EPoint); #endif AVX2TABLETYPE *mp; // TODO FIXME - global statistics reference // Stats[Calls_To_DNoise]++; const __m256d ONE_PD = _mm256_set1_pd(1.0); const __m128i short_si128 = _mm_set1_epi32(0xffff); const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0); const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON); const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy); const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn)); const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0); const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn)); const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD); const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff)); const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn)))); const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn); const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20); const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0)); const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1)); const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy); const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); int ints[4]; _mm_storeu_si128((__m128i*)(ints), i_xyzn); const int ixiy_hash = Hash2d(ints[0], ints[1]); const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]); const int ixjy_hash = Hash2d(ints[0], ints[1] + 1); const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1); const int iz = ints[2]; const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); __m256d ss; __m256d blend; __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd(); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)); // blend = _mm256_blend_pd(iii, jjj, 0); INCSUMAVX_VECTOR(mp, ss, iii); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 2); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)); blend = _mm256_blend_pd(iii, jjj, 6); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 4); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 12); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)); // blend = _mm256_blend_pd(iii, jjj, 14); INCSUMAVX_VECTOR(mp, ss, jjj); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 10); INCSUMAVX_VECTOR(mp, ss, blend); mp = 
&AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)); blend = _mm256_blend_pd(iii, jjj, 8); INCSUMAVX_VECTOR(mp, ss, blend); __m256d xy = _mm256_hadd_pd(x,y); __m128d xy_up = _mm256_extractf128_pd(xy,1); xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up); _mm_storeu_pd(&result[X],xy_up); __m128d z_up = _mm256_extractf128_pd(z,1); z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up); z_up = _mm_hadd_pd(z_up,z_up); result[Z] = _mm_cvtsd_f64(z_up); #if CHECK_FUNCTIONAL { Vector3d portable_res; PortableDNoise(portable_res , param); if (fabs(portable_res[X] - result[X]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise X error"); } if (fabs(portable_res[Y] - result[Y]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Y error"); } if (fabs(portable_res[Z] - result[Z]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Z error"); } } #endif _mm256_zeroupper(); return; }
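// A helper sketch (my own addition; the name hsum256_pd is hypothetical, and
// <immintrin.h> is assumed) of the horizontal-sum idiom that closes both noise
// routines above: fold the upper 128-bit half onto the lower one, then hadd.
static inline double hsum256_pd(__m256d v)
{
    __m128d lo = _mm256_castpd256_pd128(v);
    __m128d hi = _mm256_extractf128_pd(v, 1);
    lo = _mm_add_pd(lo, hi);        /* {v0+v2, v1+v3} */
    lo = _mm_hadd_pd(lo, lo);       /* {v0+v1+v2+v3, v0+v1+v2+v3} */
    return _mm_cvtsd_f64(lo);
}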
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, j, l_end_pre; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m256d v_tmp; __m256d v00, v01, v02, v03; __m256d v10, v11, v12, v13; __m256d v20, v21, v22, v23; __m256d v30, v31, v32, v33; __m256i v_cur_roots; __m256 v_rootmask0, v_rootmask1; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx1_root; int idx2; int idx3, idx3_root; int pad_root, pad, pad_r; idx1 = ((int) mem->e_sz) - 1; idx1_root = ((int) mem->r_sz); // the conventio is that iteration i, idx1 points to the first element of line i+1 e[idx1++] = q[n]; // pad contains the padding for row i+1 // for row n it's always 3 pad = 3; pad_root = 7; for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1 + pad; idx1_root -= 2*(n-i)+1 + pad_root; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } idx2 += pad; // padding of line i+1 // idx2 now points to the first element of the next line idx3 = idx1; idx3_root = idx1_root; pad_r = pad; for (r = i; r < n; ++r) { pad_r = (pad_r+1)&3; // padding of line r+1 idx1 = idx3; idx1_root = idx3_root; l_end = idx2 + (n-r); // l_end points to the first entry after the current row e_tmp = e[idx1++]; idx1_root++; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&15); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1_root] = r; } idx1++; idx1_root++; } v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r); for( ; idx2 < l_end; idx2 += 16 ) { v01 = _mm256_load_pd( &w[idx1 ] ); v11 = _mm256_load_pd( &w[idx1+ 4] ); v21 = _mm256_load_pd( &w[idx1+ 8] ); v31 = _mm256_load_pd( &w[idx1+12] ); v00 = _mm256_load_pd( &e[idx2 ] ); v01 = _mm256_add_pd( v01, v_tmp ); v10 = _mm256_load_pd( &e[idx2+ 4] ); v11 = _mm256_add_pd( v11, v_tmp ); v20 = _mm256_load_pd( &e[idx2+ 8] ); v21 = _mm256_add_pd( v21, v_tmp ); v30 = _mm256_load_pd( &e[idx2+12] ); v31 = _mm256_add_pd( v31, v_tmp ); v01 = _mm256_add_pd( v01, v00 ); v03 = _mm256_load_pd( &e[idx1 ] ); v11 = _mm256_add_pd( v11, v10 ); v13 = _mm256_load_pd( &e[idx1+ 4] ); v21 = _mm256_add_pd( v21, v20 ); v23 = _mm256_load_pd( &e[idx1+ 8] ); v31 = _mm256_add_pd( v31, v30 ); v33 = _mm256_load_pd( &e[idx1+12] ); v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ ); v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ ); v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ ); v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ ); _mm256_maskstore_pd( &e[idx1 ], _mm256_castpd_si256( v02 ), v01 ); _mm256_maskstore_pd( &e[idx1+ 4], _mm256_castpd_si256( v12 ), v11 ); v_rootmask0 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v02)), _mm256_cvtpd_ps(v12) , 1 ); _mm256_maskstore_pd( &e[idx1+ 8], _mm256_castpd_si256( v22 ), v21 ); _mm256_maskstore_pd( &e[idx1+12], _mm256_castpd_si256( v32 ), v31 ); v_rootmask1 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v22)), _mm256_cvtpd_ps(v32) , 1 ); _mm256_maskstore_ps( &root[idx1_root ], _mm256_castps_si256( v_rootmask0 ), _mm256_castsi256_ps( v_cur_roots ) ); _mm256_maskstore_ps( &root[idx1_root + 8], _mm256_castps_si256( v_rootmask1 ), _mm256_castsi256_ps( v_cur_roots ) ); idx1 
+= 16; idx1_root += 16; } idx2 += pad_r; idx3++; idx3_root++; } pad = (pad -1)&3; pad_root = (pad_root-1)&7; } // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding // if n is even, the total number of entries in the first // row of the table is odd, so we need padding return e[ ((n/4)+1)*4 - 1 ]; }
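// A sketch (my own addition; the name min_update_pd is hypothetical, and
// <immintrin.h> is assumed) of the conditional-update pattern in the inner loop
// of bst_compute_129_m256_maskstore_root_aligned() above: compare candidate costs
// against the current best and maskstore so only the improved lanes are written.
static inline void min_update_pd(double *best, __m256d cand)
{
    __m256d cur  = _mm256_load_pd(best);                   /* current best costs */
    __m256d mask = _mm256_cmp_pd(cand, cur, _CMP_LT_OQ);   /* lanes that improve */
    _mm256_maskstore_pd(best, _mm256_castpd_si256(mask), cand);
}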
void CalculateBasisComponents(const MDoubleArray& weights, const BaryCoords& coords, const MIntArray& triangleVertices, const MPointArray& points, const MFloatVectorArray& normals, const MIntArray& sampleIds, double* alignedStorage, MPoint& origin, MVector& up, MVector& normal) { // Start with the recreated point and normal using the barycentric coordinates of the hit point. unsigned int hitIndex = weights.length()-1; #ifdef __AVX__ __m256d originV = Dot4<MPoint>(coords[0], coords[1], coords[2], 0.0, points[triangleVertices[0]], points[triangleVertices[1]], points[triangleVertices[2]], MPoint::origin); __m256d hitNormalV = Dot4<MVector>(coords[0], coords[1], coords[2], 0.0, normals[triangleVertices[0]], normals[triangleVertices[1]], normals[triangleVertices[2]], MVector::zero); __m256d hitWeightV = _mm256_set1_pd(weights[hitIndex]); // Create the barycentric point and normal. __m256d normalV = _mm256_mul_pd(hitNormalV, hitWeightV); // Then use the weighted adjacent data. for (unsigned int j = 0; j < hitIndex; j += 4) { __m256d tempNormal = Dot4<MVector>(weights[j], weights[j+1], weights[j+2], weights[j+3], normals[sampleIds[j]], normals[sampleIds[j+1]], normals[sampleIds[j+2]], normals[sampleIds[j+3]]); normalV = _mm256_add_pd(tempNormal, normalV); } _mm256_store_pd(alignedStorage, originV); origin.x = alignedStorage[0]; origin.y = alignedStorage[1]; origin.z = alignedStorage[2]; _mm256_store_pd(alignedStorage, normalV); normal.x = alignedStorage[0]; normal.y = alignedStorage[1]; normal.z = alignedStorage[2]; // Calculate the up vector const MPoint& pt1 = points[triangleVertices[0]]; const MPoint& pt2 = points[triangleVertices[1]]; __m256d p1 = _mm256_set_pd(pt1.w, pt1.z, pt1.y, pt1.x); __m256d p2 = _mm256_set_pd(pt2.w, pt2.z, pt2.y, pt2.x); p1 = _mm256_add_pd(p1, p2); __m256d half = _mm256_set_pd(0.5, 0.5, 0.5, 0.5); p1 = _mm256_mul_pd(p1, half); __m256d upV = _mm256_sub_pd(p1, originV); _mm256_store_pd(alignedStorage, upV); up.x = alignedStorage[0]; up.y = alignedStorage[1]; up.z = alignedStorage[2]; #else MVector hitNormal; // Create the barycentric point and normal. for (int i = 0; i < 3; ++i) { origin += points[triangleVertices[i]] * coords[i]; hitNormal += MVector(normals[triangleVertices[i]]) * coords[i]; } // Use crawl data to calculate normal normal = hitNormal * weights[hitIndex]; for (unsigned int j = 0; j < hitIndex; j++) { normal += MVector(normals[sampleIds[j]]) * weights[j]; } // Calculate the up vector // The triangle vertices are sorted by decreasing barycentric coordinates so the first two are // the two closest vertices in the triangle. up = ((points[triangleVertices[0]] + points[triangleVertices[1]]) * 0.5) - origin; #endif normal.normalize(); GetValidUp(weights, points, sampleIds, origin, normal, up); }