/*
 * Approximate pi by left-point Riemann integration of 4/(1+x^2) over
 * [0,1] with dt subintervals, 4 lanes at a time with AVX.
 *
 * Fixes over the previous version:
 *  - `i <= dt - 4` underflowed for dt < 4 (size_t wrap) and the
 *    signed/unsigned comparison promoted the int index; the loop now
 *    uses a size_t index with a non-wrapping bound (i + 4 <= dt).
 *  - a scalar tail loop now covers the dt % 4 leftover subintervals,
 *    which were silently dropped before.
 *  - the deprecated `register` keyword was removed.
 *
 * dt: number of subintervals (dt == 0 yields delta = inf; result is
 *     then meaningless, as before).
 * Returns the pi approximation.
 */
double compute_pi(size_t dt)
{
    double pi = 0.0;
    const double delta = 1.0 / dt;

    __m256d ymm0 = _mm256_set1_pd(1.0);
    __m256d ymm1 = _mm256_set1_pd(delta);
    /* lane offsets 0..3*delta so one splat of i*delta covers 4 x values */
    __m256d ymm2 = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);
    __m256d ymm4 = _mm256_setzero_pd();
    __m256d ymm3;

    size_t i = 0;
    for (; i + 4 <= dt; i += 4) {
        ymm3 = _mm256_set1_pd(i * delta);
        ymm3 = _mm256_add_pd(ymm3, ymm2);   /* x, x+d, x+2d, x+3d */
        ymm3 = _mm256_mul_pd(ymm3, ymm3);   /* x^2 */
        ymm3 = _mm256_add_pd(ymm0, ymm3);   /* 1 + x^2 */
        ymm3 = _mm256_div_pd(ymm1, ymm3);   /* delta / (1 + x^2) */
        ymm4 = _mm256_add_pd(ymm4, ymm3);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* scalar tail for dt % 4 leftover subintervals */
    for (; i < dt; ++i) {
        const double x = i * delta;
        pi += delta / (1.0 + x * x);
    }

    return pi * 4.0;
}
/*
 * Evaluate three cosine Fourier series (Y = luminance plus the R and B
 * color channels) at angle phi:  sum_k coeffs[ch][k] * cos(k * phi).
 * G is then derived from Y, R, B by the fixed linear combination below.
 *
 * Both paths compute cos(k*phi) with the recurrence
 *   cos((k+1)phi) = 2 cos(phi) cos(k phi) - cos((k-1)phi)
 * so std::cos is called only once.
 *
 * NOTE(review): the AVX path loads 4 floats at coeffs[ch]+i for
 * i = 1, 5, 9, ... via the *aligned* _mm_load_ps, so it presumably
 * relies on the coefficient storage being padded/offset accordingly and
 * nCoeffs-1 being a multiple of 4 — confirm against the allocator.
 * initializeRecurrence and _mm256_splat{2,3}_pd are project helpers.
 */
Color3 evalFourier3(float * const coeffs[3], size_t nCoeffs, Float phi) {
#if FOURIER_SCALAR == 1
    /* Scalar recurrence, one coefficient per channel per iteration. */
    double cosPhi = std::cos((double) phi),
           cosPhi_prev = cosPhi,
           cosPhi_cur = 1.0f;
    double Y = 0, R = 0, B = 0;

    for (size_t i=0; i<nCoeffs; ++i) {
        Y += coeffs[0][i] * cosPhi_cur;
        R += coeffs[1][i] * cosPhi_cur;
        B += coeffs[2][i] * cosPhi_cur;

        /* Chebyshev step: advance cos(i*phi) -> cos((i+1)*phi) */
        double cosPhi_next = 2*cosPhi*cosPhi_cur - cosPhi_prev;
        cosPhi_prev = cosPhi_cur;
        cosPhi_cur = cosPhi_next;
    }

    double G = 1.39829f*Y -0.100913f*B - 0.297375f*R;
    return Color3((Float) R, (Float) G, (Float) B);
#else
    /* AVX path: 4 recurrence terms per iteration; the k = 0 terms are
       folded into lane 0 of the accumulators via _mm256_set_sd. */
    double cosPhi = std::cos((double) phi);

    __m256d
        cosPhi_prev = _mm256_set1_pd(cosPhi),
        cosPhi_cur = _mm256_set1_pd(1.0),
        Y = _mm256_set_sd((double) coeffs[0][0]),
        R = _mm256_set_sd((double) coeffs[1][0]),
        B = _mm256_set_sd((double) coeffs[2][0]),
        factorPhi_prev, factorPhi_cur;

    initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

    for (size_t i=1; i<nCoeffs; i+=4) {
        /* next four cos(k*phi) values from the vectorized recurrence */
        __m256d cosPhi_next = _mm256_add_pd(_mm256_mul_pd(factorPhi_prev, cosPhi_prev),
                _mm256_mul_pd(factorPhi_cur, cosPhi_cur));

        Y = _mm256_add_pd(Y, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[0]+i))));
        R = _mm256_add_pd(R, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[1]+i))));
        B = _mm256_add_pd(B, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[2]+i))));

        /* carry the last two lanes forward as the new prev/cur */
        cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
        cosPhi_cur = _mm256_splat3_pd(cosPhi_next);
    }

    /* horizontal reduction of all three accumulators at once */
    MM_ALIGN32 struct { double Y; double R; double B; double unused; } tmp;
    simd::hadd(Y, R, B, _mm256_setzero_pd(), (double *) &tmp);

    double G = 1.39829*tmp.Y -0.100913*tmp.B - 0.297375*tmp.R;

    return Color3((Float) tmp.R, (Float) G, (Float) tmp.B);
#endif
}
//for 20 depth void conv_forward_1(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) { uint64_t tempTime = timestamp_us(); for (int i = start; i <= end; i++) { vol_t* V = in[i]; vol_t* A = out[i]; for(int d = 0; d < 20; d++) { vol_t* f = l->filters[d]; int x = -2; int y = -2; for(int ay = 0; ay < 8; y += 1, ay++) { x = -2; for(int ax=0; ax < 8; x += 1, ax++) { double a = 0.0; __m256d sum = _mm256_setzero_pd(); for(int fy = 0; fy < 5; fy++) { int oy = y + fy; for(int fx = 0; fx < 5; fx++) { int ox = x + fx; if(oy >= 0 && oy < 8 && ox >=0 && ox < 8) { __m256d vector = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20])); __m256d vector2 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20])); __m256d vectorMult = _mm256_mul_pd(vector, vector2); sum =_mm256_add_pd (vectorMult, sum); __m256d vector0 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+4])); __m256d vector9 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+ 4])); __m256d vectorMult0 = _mm256_mul_pd(vector0, vector9); sum =_mm256_add_pd (vectorMult0, sum); __m256d vector3 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+8])); __m256d vector4 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+8])); __m256d vectorMult2 = _mm256_mul_pd(vector3, vector4); sum =_mm256_add_pd (vectorMult2, sum); __m256d vector5 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+12])); __m256d vector6 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+12])); __m256d vectorMult3 = _mm256_mul_pd(vector5, vector6); sum =_mm256_add_pd (vectorMult3, sum); __m256d vector7 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+16])); __m256d vector8 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+16])); __m256d vectorMult4 = _mm256_mul_pd(vector7, vector8); sum =_mm256_add_pd (vectorMult4, sum); } } } for(int i = 0; i < 4; i++) { a+= sum[i]; } a += l->biases->w[d]; set_vol(A, ax, ay, d, a); } } } } l->myTime += timestamp_us() - tempTime; }
/*
 * y[i] = x[i] + c for i in [0, n).
 * Two unaligned AVX lanes (8 doubles) per iteration, scalar cleanup
 * for the trailing n % 8 elements.  x and y need no alignment.
 */
void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
  const __m256d vc = _mm256_set_pd(c, c, c, c);
  ptrdiff_t i = 0;
  for (; i <= n - 8; i += 8) {
    __m256d lo = _mm256_add_pd(_mm256_loadu_pd(x + i), vc);
    __m256d hi = _mm256_add_pd(_mm256_loadu_pd(x + i + 4), vc);
    _mm256_storeu_pd(y + i, lo);
    _mm256_storeu_pd(y + i + 4, hi);
  }
  for (; i < n; ++i) {
    y[i] = x[i] + c;
  }
}
/*
 * AVX smoke test: performs a single 256-bit double add so that a
 * miscompile or an illegal-instruction fault on non-AVX hardware
 * surfaces at runtime.  volatile keeps the compiler from folding the
 * arithmetic away at compile time.
 */
int main(int, char**) {
    volatile __m256d lhs = _mm256_setzero_pd();
    volatile __m256d rhs = _mm256_set1_pd(42.42);
    volatile __m256d sum = _mm256_add_pd(lhs, rhs);
    (void)sum;
    return 0;
}
/*
 * Rank-K update of a 4x4 tile of C (column-major, leading dimension
 * lda): C(:,j) += sum_k A(:,k) * B[k*4 + j].
 *
 * `a` is the packed 4xK panel (each column 4 consecutive doubles,
 * 32-byte aligned — an aligned load is used); `b` is the packed Kx4
 * panel (4 consecutive scalars per k).  C is accessed unaligned.
 *
 * layout of 4x4 c matrix
 *   00 01 02 03
 *   10 11 12 13
 *   20 21 22 23
 *   30 31 32 33
 */
static inline void matmul_4xkxkx4(int lda, int K, double* a, double* b, double* c) {
    double* c1 = c + lda;
    double* c2 = c1 + lda;
    double* c3 = c2 + lda;

    /* accumulate on top of the existing C tile */
    __m256d acc0 = _mm256_loadu_pd(c);
    __m256d acc1 = _mm256_loadu_pd(c1);
    __m256d acc2 = _mm256_loadu_pd(c2);
    __m256d acc3 = _mm256_loadu_pd(c3);

    /* one column of a / one row of b per step */
    for (int k = 0; k < K; ++k) {
        const __m256d acol = _mm256_load_pd(a);
        a += 4;
        __m256d s0 = _mm256_broadcast_sd(b++);
        __m256d s1 = _mm256_broadcast_sd(b++);
        __m256d s2 = _mm256_broadcast_sd(b++);
        __m256d s3 = _mm256_broadcast_sd(b++);
        acc0 = _mm256_add_pd(acc0, _mm256_mul_pd(acol, s0));
        acc1 = _mm256_add_pd(acc1, _mm256_mul_pd(acol, s1));
        acc2 = _mm256_add_pd(acc2, _mm256_mul_pd(acol, s2));
        acc3 = _mm256_add_pd(acc3, _mm256_mul_pd(acol, s3));
    }

    _mm256_storeu_pd(c, acc0);
    _mm256_storeu_pd(c1, acc1);
    _mm256_storeu_pd(c2, acc2);
    _mm256_storeu_pd(c3, acc3);
}
/*
 * 4-wide Jacobi-style stencil step:
 *   v2[0..3] = 0.25 * (v1[+1] + v1[-m] + v1[+m] + v1[-1])
 * i.e. the average of the east/west (+-1) and north/south (+-m)
 * neighbours of four consecutive points.  The result is written with a
 * non-temporal store, so v2 must be 32-byte aligned.
 */
inline void kernel(adouble* v1, adouble * v2, int m) {
    const __m256d quarter = _mm256_set1_pd(0.25);

    __m256d east  = _mm256_loadu_pd(v1 + 1);
    __m256d west  = _mm256_loadu_pd(v1 - 1);
    __m256d north = _mm256_loadu_pd(v1 + m);
    __m256d south = _mm256_loadu_pd(v1 - m);

    /* same summation order as before: ((e + s) + n) + w, then * 0.25 */
    __m256d acc = _mm256_add_pd(east, south);
    acc = _mm256_add_pd(acc, north);
    acc = _mm256_add_pd(acc, west);

    _mm256_stream_pd(v2, _mm256_mul_pd(quarter, acc));
}
double hadd(const vector4d& rhs) { // rhs = (x0, x1, x2, x3) // tmp = (x2, x3, x0, x1) __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); // tmp = (x2+x0, x3+x1, -, -) tmp = _mm256_add_pd(rhs, tmp); // tmp = (x2+x0+x3+x1, -, -, -) tmp = _mm256_hadd_pd(tmp, tmp); return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); }
/*
 * 4x4 column-major matrix * 4-vector product:
 *   result = ymm[0]*v.x + ymm[1]*v.y + ymm[2]*v.z + ymm[3]*v.w
 * where ymm_v = (x, y, z, w) in lanes 0..3.
 *
 * Fixes over the previous version:
 *  - _mm256_broadcast_pd was fed `&_mm256_extractf128_pd(...)`, the
 *    address of a temporary, which is ill-formed C++; the broadcasts
 *    are now built with _mm256_permute2f128_pd (register-only).
 *  - _mm256_permute_pd(v, 0x0) selects per 128-bit lane and actually
 *    yields (x, x, z, z) — not (x, x, y, y) as the old comments
 *    claimed — so the old code multiplied ymm[1] by z and ymm[2] by y.
 *    The lane selection below restores the documented x/y/z/w order.
 */
inline float64x4_t mat4_mul_vec4(const float64x4_t ymm[4], const float64x4_t ymm_v) {
    float64x4_t xxzz = _mm256_permute_pd(ymm_v, 0x0);               // x x z z
    float64x4_t yyww = _mm256_permute_pd(ymm_v, 0xF);               // y y w w
    float64x4_t bcast0 = _mm256_permute2f128_pd(xxzz, xxzz, 0x00);  // x x x x
    float64x4_t bcast1 = _mm256_permute2f128_pd(yyww, yyww, 0x00);  // y y y y
    float64x4_t bcast2 = _mm256_permute2f128_pd(xxzz, xxzz, 0x11);  // z z z z
    float64x4_t bcast3 = _mm256_permute2f128_pd(yyww, yyww, 0x11);  // w w w w
    float64x4_t mul0 = _mm256_mul_pd(ymm[0], bcast0);
    float64x4_t mul1 = _mm256_mul_pd(ymm[1], bcast1);
    float64x4_t mul2 = _mm256_mul_pd(ymm[2], bcast2);
    float64x4_t mul3 = _mm256_mul_pd(ymm[3], bcast3);
    float64x4_t add0 = _mm256_add_pd(mul0, mul1);
    float64x4_t add1 = _mm256_add_pd(mul2, mul3);
    float64x4_t add2 = _mm256_add_pd(add0, add1);
    return add2;
}
/*
 * Element-wise vector sum: c[i] = a[i] + b[i] for i in [0, len).
 *
 * a, b and c must be 32-byte aligned (aligned AVX loads/stores are
 * used on the vector part).
 *
 * Fix over the previous version: the loop condition `i < len` with a
 * stride of 4 both read/wrote past the end when len was not a multiple
 * of 4 and had no scalar handling for the remainder; the vector loop
 * now stops at the last full group of 4 and a scalar tail finishes the
 * len % 4 leftover elements.
 */
void sum_avx(double* c, double* a, double* b, int len) {
  __m256d rA_AVX, rB_AVX, rC_AVX; // variables for AVX
  int i = 0;
  for (; i + 4 <= len; i += 4) {
    rA_AVX = _mm256_load_pd(&a[i]);
    rB_AVX = _mm256_load_pd(&b[i]);
    rC_AVX = _mm256_add_pd(rA_AVX, rB_AVX);
    _mm256_store_pd(&c[i], rC_AVX);
  }
  /* scalar tail for len % 4 */
  for (; i < len; ++i) {
    c[i] = a[i] + b[i];
  }
}
/*
 * Dot product of pa and pb over n elements.
 *
 * Fast path: when both pointers satisfy ALGEBRA_IS_ALIGNED, full groups
 * of four doubles are accumulated with aligned AVX loads; the n % 4
 * tail (and the fully unaligned case) is delegated to the scalar
 * routine vector_ps_double_basic.
 */
ALGEBRA_INLINE double vector_ps_double (const double* pa,const double* pb,size_t n) {
    if(!(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)))
        return vector_ps_double_basic(pa, pb, n);

    size_t blocks = n/4;
    size_t tail = n%4;
    double partial = 0;

    if(blocks>0) {
        __m256d acc = _mm256_setzero_pd();
        do {
            __m256d va = _mm256_load_pd(pa);
            __m256d vb = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            // multiply-accumulate one group of four
            acc = _mm256_add_pd(acc, _mm256_mul_pd(va, vb));
        } while(--blocks != 0);

        // horizontal reduction:
        // pairwise add within each 128-bit half
        acc = _mm256_hadd_pd(acc, acc);
        // swap the upper and lower 128-bit halves
        __m256d swapped = _mm256_permute2f128_pd(acc, acc, 1);
        // vertical add leaves the total in every lane
        acc = _mm256_add_pd(acc, swapped);
        // extract lane 0
        _mm_store_sd(&partial, _mm256_castpd256_pd128(acc));
    }

    /* pa/pb already advanced past the vectorized part */
    return partial + vector_ps_double_basic(pa, pb, tail);
}
/*
 * Evaluate a cosine Fourier series at angle phi:
 *   sum_{k=0}^{nCoeffs-1} coeffs[k] * cos(k * phi).
 *
 * Both paths compute cos(k*phi) with the Chebyshev recurrence
 *   cos((k+1)phi) = 2 cos(phi) cos(k phi) - cos((k-1)phi)
 * so std::cos is called only once.
 *
 * NOTE(review): the AVX path uses the *aligned* _mm_load_ps on
 * coeffs+i for i = 1, 5, 9, ... and strides by 4, so it presumably
 * relies on suitably offset/padded coefficient storage — confirm
 * against the allocator.  initializeRecurrence and
 * _mm256_splat{2,3}_pd are project helpers.
 */
Float evalFourier(const float *coeffs, size_t nCoeffs, Float phi) {
#if FOURIER_SCALAR == 1
    /* Scalar recurrence, one coefficient per iteration. */
    double cosPhi = std::cos((double) phi),
           cosPhi_prev = cosPhi,
           cosPhi_cur = 1.0,
           value = 0.0;

    for (size_t i=0; i<nCoeffs; ++i) {
        value += coeffs[i] * cosPhi_cur;

        double cosPhi_next = 2.0*cosPhi*cosPhi_cur - cosPhi_prev;
        cosPhi_prev = cosPhi_cur;
        cosPhi_cur = cosPhi_next;
    }

    return (Float) value;
#else
    /* AVX path: 4 coefficients per iteration; the k = 0 term is folded
       into lane 0 of the accumulator via _mm256_set_sd. */
    double cosPhi = std::cos((double) phi);

    __m256d
        cosPhi_prev = _mm256_set1_pd(cosPhi),
        cosPhi_cur = _mm256_set1_pd(1.0),
        value = _mm256_set_sd((double) coeffs[0]),
        factorPhi_prev, factorPhi_cur;

    initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

    for (size_t i=1; i<nCoeffs; i+=4) {
        __m256d coeff = _mm256_cvtps_pd(_mm_load_ps(coeffs+i));
        /* next four cos(k*phi) values from the vectorized recurrence */
        __m256d cosPhi_next = _mm256_add_pd(
            _mm256_mul_pd(factorPhi_prev, cosPhi_prev),
            _mm256_mul_pd(factorPhi_cur, cosPhi_cur));
        value = _mm256_add_pd(value, _mm256_mul_pd(cosPhi_next, coeff));
        /* carry the last two lanes forward as the new prev/cur */
        cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
        cosPhi_cur = _mm256_splat3_pd(cosPhi_next);
    }

    return (Float) simd::hadd(value);
#endif
}
/*
 * Transposed horizontal reduction of four vectors:
 * for row = (a, b, c, d) returns (sum(a), sum(b), sum(c), sum(d)).
 */
inline vector4d haddp(const vector4d* row) {
    // (a0+a1, b0+b1, a2+a3, b2+b3)
    __m256d ab = _mm256_hadd_pd(row[0], row[1]);
    // (c0+c1, d0+d1, c2+c3, d2+d3)
    __m256d cd = _mm256_hadd_pd(row[2], row[3]);
    // low half of ab, high half of cd: (a0+a1, b0+b1, c2+c3, d2+d3)
    __m256d mixed = _mm256_blend_pd(ab, cd, 0b1100);
    // crossed halves: (a2+a3, b2+b3, c0+c1, d0+d1)
    __m256d crossed = _mm256_permute2f128_pd(ab, cd, 0x21);
    return _mm256_add_pd(crossed, mixed);
}
/*
 * Approximate pi with the Leibniz series:
 *   pi = 4 * sum_{k>=0} (-1)^k / (2k + 1),
 * four terms per AVX iteration (lane 3 holds term k, lane 0 term k+3,
 * matching the sign vector (+, -, +, -) from high lane to low).
 *
 * Fixes over the previous version:
 *  - `i <= n - 4` underflowed for n < 4 (size_t wrap) and the
 *    signed/unsigned comparison promoted the int index; the loop now
 *    uses a size_t index with a non-wrapping bound.
 *  - a scalar tail now sums the n % 4 leftover terms, which were
 *    silently dropped before.
 *  - the deprecated `register` keyword was removed.
 *
 * n: number of series terms.  Returns the pi approximation.
 */
double compute_pi_leibniz_avx(size_t n)
{
    double pi = 0.0;
    __m256d acc = _mm256_setzero_pd();
    const __m256d two = _mm256_set1_pd(2.0);
    const __m256d one = _mm256_set1_pd(1.0);
    const __m256d sign = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);

    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256d k = _mm256_set_pd((double)i, i + 1.0, i + 2.0, i + 3.0);
        k = _mm256_mul_pd(k, two);
        k = _mm256_add_pd(k, one);                 /* 2k + 1 */
        acc = _mm256_add_pd(acc, _mm256_div_pd(sign, k));
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, acc);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* scalar tail for n % 4 leftover terms */
    for (; i < n; ++i)
        pi += ((i & 1) ? -1.0 : 1.0) / (2.0 * i + 1.0);

    return pi * 4.0;
}
// this function assumes data is stored in col-major
// if data is in row major, call it like matmul4x4(B, A, C)
/*
 * C = A * B for 4x4 doubles, column-major, all three matrices 32-byte
 * aligned (aligned AVX loads/stores).  Column j of C is the linear
 * combination of A's columns weighted by column j of B.
 */
void matmul4x4(double *A, double *B, double *C) {
    __m256d acols[4];
    __m256d out[4];

    // cache the four columns of A in registers
    for (int c = 0; c < 4; c++)
        acols[c] = _mm256_load_pd(&A[c * 4]);

    for (int c = 0; c < 4; c++) {
        out[c] = _mm256_setzero_pd();
        for (int r = 0; r < 4; r++) {
            out[c] = _mm256_add_pd(
                _mm256_mul_pd(_mm256_set1_pd(B[c * 4 + r]), acols[r]),
                out[c]);
        }
    }

    // all columns computed before any store (safe if C aliases A or B)
    for (int c = 0; c < 4; c++)
        _mm256_store_pd(&C[c * 4], out[c]);
}
/*
 * z[i] = x[i] + c * y[i] for i in [0, n).
 * One unaligned AVX lane (4 doubles) per iteration, scalar cleanup for
 * the trailing n % 4 elements.  No alignment requirements.
 */
void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
  const __m256d vc = _mm256_set_pd(c, c, c, c);
  ptrdiff_t i = 0;
  for (; i <= n - 4; i += 4) {
    __m256d scaled = _mm256_mul_pd(_mm256_loadu_pd(y + i), vc);
    __m256d result = _mm256_add_pd(_mm256_loadu_pd(x + i), scaled);
    _mm256_storeu_pd(z + i, result);
  }
  for (; i < n; ++i) {
    z[i] = x[i] + y[i] * c;
  }
}
/*
 * One Jacobi iteration on a width x height grid: every interior point
 * of newGrid becomes the average of its four neighbours in oldGrid.
 * Interior columns 1 .. width-2 are processed four at a time with AVX;
 * the (width-2) % 4 rightmost interior columns are done scalar.
 */
void jacobi_avx(GRID_T *oldGrid, GRID_T *newGrid, int width, int height){
    const int remainder = (width-2)%4;
    const __m256d quarter = _mm256_set1_pd(0.25);

    for(int i = 1; i < height-1; i++){
        for(int j = 1; j < width-4; j += 4){
            /* the four neighbour vectors of points (i, j..j+3) */
            __m256d north = _mm256_loadu_pd(&(oldGrid[(i-1)*width + j]));
            __m256d south = _mm256_loadu_pd(&(oldGrid[(i+1)*width + j]));
            __m256d east  = _mm256_loadu_pd(&(oldGrid[i*width + (j+1)]));
            __m256d west  = _mm256_loadu_pd(&(oldGrid[i*width + (j-1)]));

            /* (west + east) + (north + south), then * 0.25 */
            __m256d vertical   = _mm256_add_pd(north, south);
            __m256d horizontal = _mm256_add_pd(west, east);
            __m256d stencil    = _mm256_add_pd(horizontal, vertical);
            stencil = _mm256_mul_pd(stencil, quarter);

            /* rows are not 32-byte aligned in general: unaligned store */
            _mm256_storeu_pd(&(newGrid[i*width + j]), stencil);
        }
        /* scalar cleanup of the rightmost interior columns */
        for(int j = width - remainder - 1; j < width - 1; j++){
            newGrid[i*width + j] = (oldGrid[i*width + (j-1)]
                                  + oldGrid[i*width + (j+1)]
                                  + oldGrid[(i-1)*width + j]
                                  + oldGrid[(i+1)*width + j]) * 0.25;
        }
    }
}
/*
 * Mandelbrot escape-time renderer (AVX2 + FMA), four pixels at a time.
 *
 * For each row j in [YSTART, SY) and each group of four columns i, it
 * iterates z -> z^2 + c (up to 255 steps) for the four points
 *   c = (X0 + i*scale, Y0 + j*scale),
 * counts per-lane how many iterations stay inside |z|^2 < 4, and
 * writes one count byte per pixel to `out` (4 bytes per group).
 *
 * NOTE(review): SX is presumably a multiple of 4 and `out` large enough
 * for (SY - YSTART) * SX bytes — confirm at the call site.
 */
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256d dd = _mm256_set1_pd (scale);
    __m256d XX0 = _mm256_set1_pd (X0);

    for (unsigned j = YSTART; j < SY; j++)
    {
        __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4)
        {
            /* c.re for the four lanes: X0 + (i..i+3) * scale */
            __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3);
            __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            /* all-ones per 64-bit lane while that lane has not escaped */
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)
            {
                __m256d x2 = _mm256_mul_pd (x, x);
                __m256d y2 = _mm256_mul_pd (y, y);
                __m256d abs = _mm256_add_pd (x2, y2);
                /* predicate 1 = LT (ordered, signaling): |z|^2 < 4 */
                __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), 1));
                /* once escaped, a lane stays escaped */
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) { break; }
                /* subtracting -1 per still-active lane increments its count */
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                /* z' = z^2 + c:  y' = 2xy + y0,  x' = x^2 - y^2 + x0 */
                __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }
            /* gather the low byte of each 64-bit counter into the low
               16 bits of each 128-bit half, then pack 4 bytes into out */
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8));
            *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0) | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16);
            out += 4;
        }
    }
}
/*
 * Leibniz series pi = 4 * sum_{k>=0} (-1)^k / (2k + 1), unrolled to
 * 16 terms per iteration: four independent accumulators hide the
 * division latency; the four denominator vectors advance by 32
 * (= 16 terms * 2) each round.
 *
 * Fixes over the previous version:
 *  - `i <= n - 16` underflowed for n < 16 (size_t wrap) and the
 *    signed/unsigned comparison promoted the int index, producing a
 *    runaway loop; the index is now size_t with a non-wrapping bound.
 *  - a scalar tail now sums the n % 16 leftover terms.
 *  - the deprecated `register` keyword was removed.
 *
 * n: number of series terms.  Returns the pi approximation.
 */
double compute_pi_leibniz_avx_opt(size_t n)
{
    double pi = 0.0;
    const __m256d sign = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
    const __m256d step = _mm256_set1_pd(32.0);
    __m256d den0 = _mm256_set_pd(1.0, 3.0, 5.0, 7.0);
    __m256d den1 = _mm256_set_pd(9.0, 11.0, 13.0, 15.0);
    __m256d den2 = _mm256_set_pd(17.0, 19.0, 21.0, 23.0);
    __m256d den3 = _mm256_set_pd(25.0, 27.0, 29.0, 31.0);
    __m256d acc0 = _mm256_setzero_pd();
    __m256d acc1 = _mm256_setzero_pd();
    __m256d acc2 = _mm256_setzero_pd();
    __m256d acc3 = _mm256_setzero_pd();

    size_t i = 0;
    for (; i + 16 <= n; i += 16) {
        acc0 = _mm256_add_pd(acc0, _mm256_div_pd(sign, den0));
        den0 = _mm256_add_pd(den0, step);
        acc1 = _mm256_add_pd(acc1, _mm256_div_pd(sign, den1));
        den1 = _mm256_add_pd(den1, step);
        acc2 = _mm256_add_pd(acc2, _mm256_div_pd(sign, den2));
        den2 = _mm256_add_pd(den2, step);
        acc3 = _mm256_add_pd(acc3, _mm256_div_pd(sign, den3));
        den3 = _mm256_add_pd(den3, step);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, acc0);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, acc1);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, acc2);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, acc3);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* scalar tail for n % 16 leftover terms */
    for (; i < n; ++i)
        pi += ((i & 1) ? -1.0 : 1.0) / (2.0 * i + 1.0);

    return pi * 4.0;
}
/*
 * Subtract `factor` times the PSF, centred at (x, y), from the image
 * over the row band [startY, endY): image -= factor * psf, with the
 * PSF window clipped to the image borders.
 *
 * The bulk of each row is done 4 pixels at a time with AVX (fused
 * multiply-add when FMA4 is available); the (endX - startX) % 4
 * rightmost pixels of the window are handled scalar.
 *
 * NOTE(review): `(int) x - psfWidth/2` mixes int and size_t, so the
 * subtraction is performed unsigned before conversion — relies on the
 * usual wrap-around to land on the intended negative value; confirm.
 * NOTE(review): if the clipped window is empty (endX < startX), the
 * size_t subtraction `endX - startX` wraps — presumably callers only
 * pass positions whose window intersects the image; confirm.
 */
void SimpleClean::PartialSubtractImageAVX(double *image, size_t imgWidth, size_t imgHeight, const double *psf, size_t psfWidth, size_t psfHeight, size_t x, size_t y, double factor, size_t startY, size_t endY)
{
    size_t startX, endX;
    /* top-left corner of the PSF in image coordinates (may be negative) */
    int offsetX = (int) x - psfWidth/2, offsetY = (int) y - psfHeight/2;

    if(offsetX > 0)
        startX = offsetX;
    else
        startX = 0;

    if(offsetY > (int) startY)
        startY = offsetY;

    endX = std::min(x + psfWidth/2, imgWidth);
    /* shrink the vector span to a multiple of 4; the rest goes scalar */
    size_t unAlignedCount = (endX - startX) % 4;
    endX -= unAlignedCount;

    endY = std::min(y + psfHeight/2, endY);

    const __m256d mFactor = _mm256_set1_pd(-factor);

    for(size_t ypos = startY; ypos < endY; ++ypos)
    {
        double *imageIter = image + ypos * imgWidth + startX;
        const double *psfIter = psf + (ypos - offsetY) * psfWidth + startX - offsetX;
        for(size_t xpos = startX; xpos != endX; xpos+=4)
        {
            __m256d imgVal = _mm256_loadu_pd(imageIter), psfVal = _mm256_loadu_pd(psfIter);
#ifdef __FMA4__
            /* image += psf * (-factor) in one fused op */
            _mm256_storeu_pd(imageIter, _mm256_fmadd_pd(psfVal, mFactor, imgVal));
#else
            _mm256_storeu_pd(imageIter, _mm256_add_pd(imgVal, _mm256_mul_pd(psfVal, mFactor)));
#endif
            imageIter+=4;
            psfIter+=4;
        }
        /* scalar tail of the row */
        for(size_t xpos = endX; xpos!=endX + unAlignedCount; ++xpos)
        {
            *imageIter -= *psfIter * factor;
            ++imageIter;
            ++psfIter;
        }
    }
}
/*
 * Signed shoelace sum of an irregular polygon (AVX double variant; the
 * signature is generated by the irreg_poly_area_func_sign macro, which
 * presumably supplies `cords` and `cords_len`).
 *
 * Each vector iteration evaluates four cross terms
 *   x_i * y_{i+1} - y_i * x_{i+1}
 * via _mm256_hsub_pd; the scalar loop finishes the remaining vertices
 * with _calc_diff_of_adj_prods.
 *
 * NOTE(review): `cords_len - 4` / `cords_len - 1` are unsigned — for
 * cords_len < 4 the bound wraps; presumably callers guarantee at least
 * a handful of vertices.  Confirm.  Loads are aligned (_mm256_load_pd),
 * so `cords` must be 32-byte aligned.
 */
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d
        curr,
        forw,
        coef_0,
        coef_1,
        end = _mm256_load_pd((const double *)cords),
        accum_sum = _mm256_setzero_pd();
    double accum_sum_aux;

    unsigned long index;
    for (index = 0; index < (cords_len - 4); index += 4) {
        curr = end;                                               // x0,y0,x1,y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]); // x2,y2,x3,y3
        end = _mm256_load_pd((const double *)&cords[index + 4]);  // x4,y4,x5,y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001);  // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000);   // x2, y2, x4, y4

        //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd(
                // x0*y1 - y0*x1, x1*y2 - y1x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000), // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)       // y1, x1, y3, x3
                ),
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4
                // ^^^^^^^^^^^^^^^ x1*y2, y1*x2, x3*y4, y3*x4
            )
        );
    }

    /* horizontal reduction of the four partial sums */
    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ...

    /* scalar cleanup of the remaining edges */
    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
    //    return scalar_half(scalar_abs(accum_sum_aux));
}
/*
 * Packed complex division of two double-precision complex pairs:
 * computes lhs / rhs = lhs * conj(rhs) / |rhs|^2 lane-wise.
 *
 * NOTE(review): the template header / return type of this definition is
 * not visible in this chunk — presumably `template <typename T>` with an
 * avx_simd_complex_double<T> return; confirm against the full file.
 */
div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
    //lhs = [x1.real, x1.img, x2.real, x2.img]
    //rhs = [y1.real, y1.img, y2.real, y2.img]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real]
    __m256d ymm0 = _mm256_movedup_pd(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
    __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

    //ymm2 = [x1.img, x1.real, x2.img, x2.real]
    __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

    //ymm4 = [x.img * y.img, x.real * y.img]
    __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

    //ymm5 = subadd((lhs * ymm0), ymm4)
    // numerator: real lanes x.re*y.re + x.im*y.im,
    //            imag lanes x.im*y.re - x.re*y.im
#ifdef __FMA__
    __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
    /* non-FMA fallback: negate then addsub reproduces fmsubadd */
    __m256d t1 = _mm256_mul_pd(lhs.value, ymm0);
    __m256d t2 = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
    __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

    //ymm3 = [y.imag^2, y.imag^2]
    __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

    //ymm0 = (ymm0 * ymm0 + ymm3)
    // denominator |rhs|^2 = y.re^2 + y.im^2 in every lane
#ifdef __FMA__
    ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
    __m256d t3 = _mm256_mul_pd(ymm0, ymm0);
    ymm0 = _mm256_add_pd(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_pd(ymm5, ymm0);
}
/*
 * Approximate pi with the Euler/Basel series
 *   sum_{k=1}^{n} 1/k^2 = pi^2 / 6   =>   pi = sqrt(6 * sum),
 * four terms per AVX iteration plus a scalar tail.
 *
 * Fixes over the previous version:
 *  - the series started at k = 0, so the first lane evaluated
 *    1/0 = inf and poisoned the whole result;
 *  - the 6 * sum product was computed into a register (ymm3) but the
 *    un-scaled sum was stored and used, so the factor 6 was lost;
 *  - `i <= n - 4` underflowed for n < 4 (size_t wrap) with a
 *    signed/unsigned comparison; the loop index is now size_t with a
 *    non-wrapping bound, and the n % 4 leftover terms are summed in a
 *    scalar tail.
 *  - the deprecated `register` keyword was removed.
 *
 * n: number of series terms (k = 1 .. n).  Returns the approximation.
 */
double compute_pi_euler_avx(size_t n)
{
    double sum = 0.0;
    __m256d acc = _mm256_setzero_pd();
    const __m256d one = _mm256_set1_pd(1.0);

    size_t k = 1;                      /* series starts at k = 1, not 0 */
    for (; k + 3 <= n; k += 4) {
        __m256d kv = _mm256_set_pd((double)k, k + 1.0, k + 2.0, k + 3.0);
        kv = _mm256_mul_pd(kv, kv);    /* k^2 */
        acc = _mm256_add_pd(acc, _mm256_div_pd(one, kv));
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, acc);
    sum += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* scalar tail for the n % 4 leftover terms */
    for (; k <= n; ++k)
        sum += 1.0 / ((double)k * (double)k);

    return sqrt(6.0 * sum);
}
/*
 * Reduce nthreads per-thread grids into dst: for every 256-bit chunk,
 * dst = srcs[thread 0] + srcs[thread 1] + ... + srcs[thread nthreads-1],
 * where thread g's grid starts at srcs + g * grid_size * grid_pitch.
 * Chunks are independent, so the outer loop is OpenMP-parallel.
 *
 * NOTE(review): as256p/as256pc presumably reinterpret complexd* as
 * (const) __m256d*, which requires the grids to be 32-byte aligned —
 * confirm against the allocator (the commented-out loadu line suggests
 * alignment was once in question).
 */
inline void addGrids(
    complexd dst[]
  , const complexd srcs[]
  , int nthreads
  , int grid_pitch
  , int grid_size
  )
{
  int siz = grid_size*grid_pitch;
#pragma omp parallel for
  /* one iteration per 32-byte (= 256/8) chunk of the grid */
  for (unsigned int i = 0; i < siz*sizeof(complexd)/(256/8); i++) {
    __m256d sum = as256pc(srcs)[i];
    // __m256d sum = _mm256_loadu_pd(reinterpret_cast<const double*>(as256pc(srcs)+i));
    for (int g = 1; g < nthreads; g ++)
      sum = _mm256_add_pd(sum, as256pc(srcs + g * siz)[i]);
    as256p(dst)[i] = sum;
  }
}
/* sum double vectors ----------------------------------------------------------
 * sum double vectors: out=data1.+data2
 * args   : double *data1    I   input double array
 *          double *data2    I   input double array
 *          int    n         I   number of input data
 *          double *out      O   output double array
 * return : none
 * note   : AVX command is used if "AVX" is defined
 *-----------------------------------------------------------------------------*/
extern void sumvd(const double *data1, const double *data2, int n, double *out)
{
    int i;
#if !defined(AVX_ENABLE)
    /* portable scalar path */
    for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
#else
    int vec_len=4*(n/4);
    __m256d va,vb,vsum;
    if (n<8) {
        /* short arrays: not worth the vector setup */
        for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
    }
    else {
        /* unaligned AVX adds over full groups of four */
        for (i=0;i<vec_len;i+=4) {
            va=_mm256_loadu_pd(&data1[i]);
            vb=_mm256_loadu_pd(&data2[i]);
            vsum=_mm256_add_pd(va,vb);
            _mm256_storeu_pd(&out[i],vsum);
        }
        /* scalar tail for n % 4 */
        for (;i<n;i++) out[i]=data1[i]+data2[i];
    }
#endif
}
/*
 * 5x5 convolution over a 32x32 spatial grid with depth 3 (16 filters),
 * producing one 32x32x16 output volume per input volume in
 * [start, end].  Accumulates elapsed wall time into l->myTime.
 *
 * NOTE(review): the depth-3 dot product is done with a single 4-wide
 * unaligned load; lane 3 is never summed (the reduction below stops at
 * 3), but the load still *reads* a 4th double past each depth-3 group —
 * presumably safe because the underlying w arrays have trailing
 * storage, but confirm before reusing on tightly-sized buffers.
 */
void conv_forward(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 16; d++) {
      vol_t* f = l->filters[d];
      int x = -2;                       /* stride 1, pad 2 */
      int y = -2;
      for(int ay = 0; ay < 32; y += 1, ay++) {
        x = -2;
        for(int ax=0; ax < 32; x += 1, ax++) {
          double a = 0.0;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            int oy = y + fy;
            for(int fx = 0; fx < 5; fx++) {
              int ox = x + fx;
              /* skip taps that fall outside the 32x32 input */
              if(oy >= 0 && oy < 32 && ox >=0 && ox < 32) {
                __m256d vector = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*3]));
                __m256d vector2 = _mm256_loadu_pd (&(V->w[((32 * oy)+ox)*3]));
                __m256d vectorMult = _mm256_mul_pd(vector, vector2);
                sum =_mm256_add_pd (vectorMult, sum);
              }
            }
          }
          /* only lanes 0..2 are valid (depth 3); note this inner `i`
             shadows the volume index above */
          for(int i = 0; i < 3; i++) {
            a+= sum[i];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}
/*
 * 8x4 micro-kernel for a tanh (hyperbolic tangent) kernel-summation:
 * performs a rank-k update into eight v4df_t accumulators (the
 * included ks_rank_k_int_d8x4.h segment), optionally accumulates the
 * partial C tile, applies c = tanh(scal * c + cons) element-wise, and
 * finishes with the multiple-right-hand-side summation segment.
 *
 * The h/aa/bb parameters are unused placeholders ("NOP") kept so all
 * micro-kernels share one signature.  _mm256_tanh_pd presumably comes
 * from SVML or a project drop-in — confirm which is linked.
 * C storage: 4-column panels at c, c+8, c+16, c+24 (rows 0-3) and
 * c+4, c+12, c+20, c+28 (rows 4-7), all 32-byte aligned.
 */
void ks_tanh_int_d8x4(
    int k,
    int rhs,
    double *h,  // NOP
    double *u,
    double *aa, // NOP
    double *a,
    double *bb, // NOP
    double *b,
    double *w,
    double *c,
    ks_t *ker,
    aux_t *aux
    )
{
  int i, rhs_left;
  double scal = ker->scal;
  double cons = ker->cons;
  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t u03, u47;
  v4df_t a03, a47, A03, A47; // prefetched A
  v4df_t b0, b1, b2, b3, B0; // prefetched B
  v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;

  // Rank-k update segment
#include "ks_rank_k_int_d8x4.h"

  // Accumulate
  if ( aux->pc ) {
    /* add the previously computed partial tile back in */
    tmpc03_0.v = _mm256_load_pd( (double*)( c ) );
    c03_0.v = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4 ) );
    c47_0.v = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8 ) );
    c03_1.v = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
    c47_1.v = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
    c03_2.v = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
    c47_2.v = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
    c03_3.v = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
    c47_3.v = _mm256_add_pd( tmpc47_3.v, c47_3.v );
  }

  // Scale before the kernel evaluation
  c_tmp.v = _mm256_broadcast_sd( &scal );
  c03_0.v = _mm256_mul_pd( c_tmp.v, c03_0.v );
  c03_1.v = _mm256_mul_pd( c_tmp.v, c03_1.v );
  c03_2.v = _mm256_mul_pd( c_tmp.v, c03_2.v );
  c03_3.v = _mm256_mul_pd( c_tmp.v, c03_3.v );
  c47_0.v = _mm256_mul_pd( c_tmp.v, c47_0.v );
  c47_1.v = _mm256_mul_pd( c_tmp.v, c47_1.v );
  c47_2.v = _mm256_mul_pd( c_tmp.v, c47_2.v );
  c47_3.v = _mm256_mul_pd( c_tmp.v, c47_3.v );

  // Shift before the kernel evaluation
  c_tmp.v = _mm256_broadcast_sd( &cons );
  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

  // Preload u03, u47
  u03.v = _mm256_load_pd( (double*)u );
  u47.v = _mm256_load_pd( (double*)( u + 4 ) );

  // Prefetch u and w
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u + 8 ) );
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) );

  // c = tanh( c );
  c03_0.v = _mm256_tanh_pd( c03_0.v );
  c03_1.v = _mm256_tanh_pd( c03_1.v );
  c03_2.v = _mm256_tanh_pd( c03_2.v );
  c03_3.v = _mm256_tanh_pd( c03_3.v );
  c47_0.v = _mm256_tanh_pd( c47_0.v );
  c47_1.v = _mm256_tanh_pd( c47_1.v );
  c47_2.v = _mm256_tanh_pd( c47_2.v );
  c47_3.v = _mm256_tanh_pd( c47_3.v );

  // Multiple rhs kernel summation.
#include "ks_kernel_summation_int_d8x4.h"
}
/*
 * Importance-sample an angle phi proportionally to the luminance
 * Fourier series coeffs[0], then evaluate the R and B series at the
 * sampled angle; G is derived from Y, R, B by the fixed linear
 * combination at the end.  Outputs the sample's pdf and phi, and
 * returns the color weight.
 *
 * Method: solve CDF(b) = y for b on [0, pi] with Newton iterations
 * safeguarded by bisection brackets [a, c] (at most 20 iterations,
 * tolerance 1e-5 * coeff0).  The sample is first folded onto [0, 0.5]
 * (flip) to exploit the series' symmetry; a Gaussian-based initial
 * guess is used when enough coefficients are available.
 *
 * NOTE(review): the AVX branches use the *aligned* _mm_load_ps /
 * _mm256_load_pd on coeffs[ch]+j and recip+j for j = 1, 5, 9, ... and
 * stride by 4, so they presumably rely on padded/offset storage —
 * confirm against the allocator.  initializeRecurrence and
 * _mm256_splat{2,3}_pd are project helpers.
 */
Color3 sampleFourier3(float * const coeffs[3], const double *recip, size_t nCoeffs, Float sample, Float &pdf, Float &phi) {
    /* fold the sample onto [0, 0.5], remembering the mirrored half */
    bool flip = false;
    if (sample < 0.5f) {
        sample *= 2.0f;
    } else {
        sample = 1.0f - 2.0f * (sample - 0.5f);
        flip = true;
    }

    int iterations = 0;

    double a = 0.0,
           c = math::Pi_d,
           coeff0 = coeffs[0][0],
           y = coeff0*math::Pi_d*sample,
           deriv = 0.0,
           b = 0.5 * math::Pi_d,
           cosB = 0,
           sinB = 1;

    /* refined initial guess from a normal-distribution approximation */
    if (nCoeffs > 10 && sample != 0 && sample != 1) {
        float stddev = std::sqrt(2.0f / 3.0f * std::log(coeffs[0][1] / coeffs[0][2]));
        if (std::isfinite(stddev)) {
            b = std::min(c, (double) math::normal_quantile(0.5f + sample / 2) * stddev);
            cosB = std::cos(b);
            sinB = std::sqrt(1 - cosB * cosB);
        }
    }

#if FOURIER_SCALAR != 1
    __m256d factorB_prev, factorB_cur;
#endif

    /* Newton/bisection solve of CDF(b) = y; value = CDF(b) - y and
       deriv = pdf-proportional density at b */
    while (true) {
#if FOURIER_SCALAR == 1
        double cosB_prev = cosB,
               sinB_prev = -sinB,
               sinB_cur = 0.0,
               cosB_cur = 1.0,
               value = coeff0 * b;

        deriv = coeff0;

        for (size_t j=1; j<nCoeffs; ++j) {
            /* Chebyshev recurrences for sin(j b) and cos(j b) */
            double sinB_next = 2.0*cosB*sinB_cur - sinB_prev,
                   cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                   coeff = (double) coeffs[0][j];

            value += coeff * recip[j] * sinB_next;
            deriv += coeff * cosB_next;

            sinB_prev = sinB_cur; sinB_cur = sinB_next;
            cosB_prev = cosB_cur; cosB_cur = cosB_next;
        }
#else
        initializeRecurrence(cosB, factorB_prev, factorB_cur);

        __m256d
            sinB_prev = _mm256_set1_pd(-sinB),
            sinB_cur = _mm256_set1_pd(0.0),
            cosB_prev = _mm256_set1_pd(cosB),
            cosB_cur = _mm256_set1_pd(1.0),
            value_vec = _mm256_set_sd(coeff0 * b),
            deriv_vec = _mm256_set_sd(coeff0);

        for (size_t j=1; j<nCoeffs; j+=4) {
            __m128 coeff_vec_f = _mm_load_ps(coeffs[0]+j);
            __m256d recip_vec = _mm256_load_pd(recip+j);
            __m256d coeff_vec = _mm256_cvtps_pd(coeff_vec_f);

            /* vectorized sin/cos recurrences, 4 harmonics per step */
            __m256d sinB_next = _mm256_add_pd(
                _mm256_mul_pd(factorB_prev, sinB_prev),
                _mm256_mul_pd(factorB_cur, sinB_cur));

            __m256d cosB_next = _mm256_add_pd(
                _mm256_mul_pd(factorB_prev, cosB_prev),
                _mm256_mul_pd(factorB_cur, cosB_cur));

            value_vec = _mm256_add_pd(value_vec, _mm256_mul_pd(
                _mm256_mul_pd(recip_vec, coeff_vec), sinB_next));
            deriv_vec = _mm256_add_pd(deriv_vec, _mm256_mul_pd(coeff_vec, cosB_next));

            sinB_prev = _mm256_splat2_pd(sinB_next);
            cosB_prev = _mm256_splat2_pd(cosB_next);
            sinB_cur = _mm256_splat3_pd(sinB_next);
            cosB_cur = _mm256_splat3_pd(cosB_next);
        }

        double value = simd::hadd(value_vec);
        deriv = simd::hadd(deriv_vec);
#endif

        value -= y;

        if (std::abs(value) <= 1e-5 * coeff0 || ++iterations > 20)
            break;
        else if (value > 0.0)
            c = b;                      /* overshot: shrink right bracket */
        else
            a = b;                      /* undershot: shrink left bracket */

        b -= value / deriv;             /* Newton step */

        /* fall back to bisection when Newton leaves the bracket */
        if (!(b >= a && b <= c))
            b = 0.5f * (a + c);

        cosB = std::cos(b);
        sinB = std::sqrt(1-cosB*cosB);
    }

    double Y = deriv;
    if (flip)
        b = 2.0*math::Pi_d - b;

    pdf = (Float) (math::InvTwoPi_d * Y / coeff0);
    phi = (Float) b;

    /* evaluate the R and B cosine series at the sampled angle */
#if FOURIER_SCALAR == 1
    double cosB_prev = cosB, cosB_cur = 1.0;
    double R = coeffs[1][0];
    double B = coeffs[2][0];

    for (size_t j=1; j<nCoeffs; ++j) {
        double cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
               coeffR = (double) coeffs[1][j],
               coeffB = (double) coeffs[2][j];

        R += coeffR * cosB_next;
        B += coeffB * cosB_next;

        cosB_prev = cosB_cur;
        cosB_cur = cosB_next;
    }
#else
    /* reuses factorB_prev/cur from the final solver iteration */
    __m256d
        cosB_prev = _mm256_set1_pd(cosB),
        cosB_cur = _mm256_set1_pd(1.0),
        R_vec = _mm256_set_sd(coeffs[1][0]),
        B_vec = _mm256_set_sd(coeffs[2][0]);

    for (size_t j=1; j<nCoeffs; j+=4) {
        __m128 coeff_R_vec_f = _mm_load_ps(coeffs[1]+j);
        __m128 coeff_B_vec_f = _mm_load_ps(coeffs[2]+j);
        __m256d coeff_R_vec = _mm256_cvtps_pd(coeff_R_vec_f);
        __m256d coeff_B_vec = _mm256_cvtps_pd(coeff_B_vec_f);

        __m256d cosB_next = _mm256_add_pd(
            _mm256_mul_pd(factorB_prev, cosB_prev),
            _mm256_mul_pd(factorB_cur, cosB_cur));

        R_vec = _mm256_add_pd(R_vec, _mm256_mul_pd(coeff_R_vec, cosB_next));
        B_vec = _mm256_add_pd(B_vec, _mm256_mul_pd(coeff_B_vec, cosB_next));

        cosB_prev = _mm256_splat2_pd(cosB_next);
        cosB_cur = _mm256_splat3_pd(cosB_next);
    }

    double R = simd::hadd(R_vec);
    double B = simd::hadd(B_vec);
#endif

    double G = 1.39829 * Y - 0.100913 * B - 0.297375 * R;

    return Color3((Float) R, (Float) G, (Float) B) * (2 * math::Pi) * (Float) (coeff0 / Y);
}
/*
 * In-place Cholesky factorization of the n x n matrix A (accessed via
 * the project's IDX(row, col, size) macro).  The k-sum
 * A[i][j] -= sum_k A[i][k] * A[j][k] is vectorized 8 doubles at a time
 * once enough columns are available (j > 8); the column scaling by the
 * diagonal is manually unrolled by 8.
 *
 * Returns 0 on success, 1 if a non-positive pivot is encountered
 * (matrix not positive-definite).
 */
int simd_chol(double *A, int n){
    register int i;
    register int j;
    register int k;
    register int local_size = n;
    register __m256d v1, v2, v3, v4, mul1, mul2, sum;

    for (j = 0; j < local_size; j++) {
        for (i = j; i < local_size; i++) {
            register double Aij = A[IDX(i, j, local_size)];
            if (j > 8)
                /* vectorize while 8 or more k values remain */
                for (k = 0; k < j;) {
                    if (k < j - 8){
                        v1 = _mm256_loadu_pd(A+IDX(i, k, local_size));
                        v2 = _mm256_loadu_pd(A+IDX(j, k, local_size));
                        mul1 = _mm256_mul_pd(v1, v2);
                        v3 = _mm256_loadu_pd(A+IDX(i, k + 4, local_size));
                        v4 = _mm256_loadu_pd(A+IDX(j, k + 4, local_size));
                        mul2 = _mm256_mul_pd(v3, v4);
                        sum = _mm256_add_pd(mul1, mul2);
                        /* lane reduction (GNU vector subscript) */
                        Aij -= (sum[3] + sum[2] + sum[1] + sum[0]);
                        k = k + 8;
                    }
                    else {
                        Aij -= A[IDX(i, k, local_size)] * A[IDX(j, k, local_size)];
                        k++;
                    }
                }
            // i <= 8
            else
                /* short rows: plain scalar dot product */
                for (k = 0; k < j; ++k)
                    Aij -= A[IDX(i, k, local_size)] * A[IDX(j, k, local_size)];
            A[IDX(i, j, local_size)] = Aij;
        }
        /* pivot must be positive for a real Cholesky factor */
        if (A[IDX(j, j, local_size)] < 0.0) {
            return (1);
        }
        A[IDX(j, j, local_size)] = sqrt(A[IDX(j, j, local_size)]);
        /* note: uses n here, which equals local_size */
        register double Ajj = A[IDX(j, j, n)];
        /* scale the column below the diagonal, unrolled by 8 */
        for (i = j + 1; i < local_size;){
            if (i < local_size - 8){
                A[IDX(i, j, local_size)] /= Ajj;
                A[IDX(i + 1, j, local_size)] /= Ajj;
                A[IDX(i + 2, j, local_size)] /= Ajj;
                A[IDX(i + 3, j, local_size)] /= Ajj;
                A[IDX(i + 4, j, local_size)] /= Ajj;
                A[IDX(i + 5, j, local_size)] /= Ajj;
                A[IDX(i + 6, j, local_size)] /= Ajj;
                A[IDX(i + 7, j, local_size)] /= Ajj;
                i += 8;
            }
            else {
                A[IDX(i, j, local_size)] /= Ajj;
                i++;
            }
        }
    }
    return (0);
}
/* Naive implementation of Matrix Matrix Multiplication
@param A input matrix
@param B input matrix
@param C output matrix
*/
/*
 * Computes C = A * B in 4x4 output blocks.  Rows of A (A.get) and
 * transposed columns of B (B.getT) are walked as __m256d* streams; the
 * element-wise * and + on __m256d are GNU vector extensions.  Each of
 * the 16 accumulators K..Z holds the 4 partial products of one output
 * element; the hadd/blend/permute sequence then reduces four
 * accumulators into one vector of 4 finished elements per C.set call.
 *
 * NOTE(review): dimM, dimN and dimL are presumably multiples of 4 and
 * the row storage 32-byte aligned — confirm against the Matrix class.
 */
inline void naive(const Matrix& A, const Matrix& B, Matrix& C){
    //preload dimensions for faster access
    int dimM = C.getDimM();
    int dimN = C.getDimN();
    int dimL = A.getDimN();

    for (int m = 0; m < dimM; m+=4){ ///rows of c
        for (int n = 0; n < dimN; n+=4){ ///cols of c
            //do calculation of a 4x4 block
            //std::cout << m << "\t" << n << std::endl;

            /* four consecutive rows of A ... */
            __m256d* pA = A.get(m, 0);
            __m256d* pB = A.get(m+1, 0);
            __m256d* pC = A.get(m+2, 0);
            __m256d* pD = A.get(m+3, 0);
            /* ... and four transposed columns of B */
            __m256d* pK = B.getT(0, n);
            __m256d* pL = B.getT(0, n+1);
            __m256d* pM = B.getT(0, n+2);
            __m256d* pN = B.getT(0, n+3);
            //std::cout << pA << "\t" << pB << "\t" << pC << "\t" << pD << std::endl;

            /* 16 accumulators, one per element of the 4x4 block */
            __m256d K = _mm256_setzero_pd();
            __m256d L = _mm256_setzero_pd();
            __m256d M = _mm256_setzero_pd();
            __m256d N = _mm256_setzero_pd();

            __m256d O = _mm256_setzero_pd();
            __m256d P = _mm256_setzero_pd();
            __m256d Q = _mm256_setzero_pd();
            __m256d R = _mm256_setzero_pd();

            __m256d S = _mm256_setzero_pd();
            __m256d T = _mm256_setzero_pd();
            __m256d U = _mm256_setzero_pd();
            __m256d V = _mm256_setzero_pd();

            __m256d W = _mm256_setzero_pd();
            __m256d X = _mm256_setzero_pd();
            __m256d Y = _mm256_setzero_pd();
            __m256d Z = _mm256_setzero_pd();

            /* each step consumes one __m256d = 4 doubles of every stream */
            for (int l = 0; l < dimL; l+=4){
                //std::cout <<"mul" << std::endl;
                K = K + (*pA) * (*pK);
                L = L + (*pA) * (*pL);
                M = M + (*pA) * (*pM);
                N = N + (*pA) * (*pN);

                O = O + (*pB) * (*pK);
                P = P + (*pB) * (*pL);
                Q = Q + (*pB) * (*pM);
                R = R + (*pB) * (*pN);

                S = S + (*pC) * (*pK);
                T = T + (*pC) * (*pL);
                U = U + (*pC) * (*pM);
                V = V + (*pC) * (*pN);

                W = W + (*pD) * (*pK);
                X = X + (*pD) * (*pL);
                Y = Y + (*pD) * (*pM);
                Z = Z + (*pD) * (*pN);
                //std::cout << "inc" <<std::endl;
                pA++; pB++; pC++; pD++;
                pK++; pL++; pM++; pN++;
            }

            /* reduce K,L,M,N into row m of the block */
            // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
            __m256d sumab = _mm256_hadd_pd(K, L);
            // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
            __m256d sumcd = _mm256_hadd_pd(M, N);
            // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
            __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
            // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
            __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
            __m256d sum = _mm256_add_pd(perm, blend);
            C.set(m, n, sum);
            //C(m  , n)   = K[0] + K[1] + K[2] + K[3];
            //C(m  , n+1) = L[0] + L[1] + L[2] + L[3];
            //C(m  , n+2) = M[0] + M[1] + M[2] + M[3];
            //C(m  , n+3) = N[0] + N[1] + N[2] + N[3];

            /* reduce O,P,Q,R into row m+1 */
            // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
            sumab = _mm256_hadd_pd(O, P);
            // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
            sumcd = _mm256_hadd_pd(Q, R);
            // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
            blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
            // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
            perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
            sum = _mm256_add_pd(perm, blend);
            C.set(m+1, n, sum);
            //C(m+1, n )  = O[0] + O[1] + O[2] + O[3];
            //C(m+1, n+1) = P[0] + P[1] + P[2] + P[3];
            //C(m+1, n+2) = Q[0] + Q[1] + Q[2] + Q[3];
            //C(m+1, n+3) = R[0] + R[1] + R[2] + R[3];

            /* reduce S,T,U,V into row m+2 */
            // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
            sumab = _mm256_hadd_pd(S, T);
            // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
            sumcd = _mm256_hadd_pd(U, V);
            // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
            blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
            // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
            perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
            sum = _mm256_add_pd(perm, blend);
            C.set(m+2, n, sum);
            //C(m+2, n )  = S[0] + S[1] + S[2] + S[3];
            //C(m+2, n+1) = T[0] + T[1] + T[2] + T[3];
            //C(m+2, n+2) = U[0] + U[1] + U[2] + U[3];
            //C(m+2, n+3) = V[0] + V[1] + V[2] + V[3];

            /* reduce W,X,Y,Z into row m+3 */
            // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
            sumab = _mm256_hadd_pd(W, X);
            // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
            sumcd = _mm256_hadd_pd(Y, Z);
            // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
            blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
            // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
            perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
            sum = _mm256_add_pd(perm, blend);
            C.set(m+3, n, sum);
            //C(m+3, n )  = W[0] + W[1] + W[2] + W[3];
            //C(m+3, n+1) = X[0] + X[1] + X[2] + X[3];
            //C(m+3, n+2) = Y[0] + Y[1] + Y[2] + Y[3];
            //C(m+3, n+3) = Z[0] + Z[1] + Z[2] + Z[3];
        }
    }
}