C++ (Cpp) _mm256_mul_pd Examples

Example #1

0

Show file

File: convert_simd_avx.c Project: SudarshanRaj/SIMD-Experiments

/**
 * Convert an array of 32-bit integers to doubles and scale each by `slope`:
 * y[k] = (double)u[k] * slope for k in [0, n).
 *
 * u      input integers (no alignment requirement)
 * y      output doubles, room for n elements (no alignment requirement)
 * n      number of elements; any value, including 0 and non-multiples of 8
 * slope  scale factor applied to every converted value
 */
void convert_simd_avx(const int32_t * u, double * y, size_t n, double slope)
{
	const __m256d slope_x4 = _mm256_set1_pd(slope);
	size_t k = 0;

	/* Vector body: 8 integers per iteration, as two groups of 4. */
	for (; n - k >= 8; k += 8)
	{
		/* Offset the int32_t pointer BEFORE casting: adding 4 to a
		 * __m128i pointer would skip 4 whole vectors (16 integers). */
		const __m128i lo = _mm_loadu_si128((const __m128i *)(u + k));
		const __m128i hi = _mm_loadu_si128((const __m128i *)(u + k + 4));

		_mm256_storeu_pd(y + k,     _mm256_mul_pd(_mm256_cvtepi32_pd(lo), slope_x4));
		_mm256_storeu_pd(y + k + 4, _mm256_mul_pd(_mm256_cvtepi32_pd(hi), slope_x4));
	}

	/* Scalar tail so that no element past n is read or written. */
	for (; k < n; k++)
	{
		y[k] = (double)u[k] * slope;
	}
}

Example #2

0

Show file

File: dVect.cpp Project: smgogarten/SNPRelate

// Scale the n doubles starting at p by v, in place (p[i] *= v).
// With AVX or SSE2 enabled, leading elements are handled one at a time
// until p reaches the vector alignment, so the main loop can use aligned
// loads/stores; a pointer that is not even 8-byte aligned takes the
// unaligned path instead.
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)

	const __m256d scale4 = _mm256_set1_pd(v);
	const __m128d scale2 = _mm256_castpd256_pd128(scale4);
	const size_t misalign = (size_t)p & 0x1F;

	if ((misalign & 0x07) == 0)
	{
		// number of doubles between p and the next 32-byte boundary
		size_t lead = (misalign == 0) ? 0 : ((0x20 - misalign) >> 3);
		while (lead > 0 && n > 0)
		{
			*p *= v;
			p++; n--; lead--;
		}
		// aligned 4-wide body
		while (n >= 4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), scale4));
			p += 4; n -= 4;
		}
		// at most one aligned 2-wide step
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), scale2));
			p += 2; n -= 2;
		}
	}
	else
	{
		// p is not on an 8-byte boundary: alignment can never be reached
		while (n >= 4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), scale4));
			p += 4; n -= 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), scale2));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	const __m128d scale2 = _mm_set1_pd(v);
	const size_t misalign = (size_t)p & 0x0F;

	if (misalign == 0x00 || misalign == 0x08)
	{
		// a single scalar step brings an 8-byte-aligned p to 16 bytes
		if (misalign == 0x08 && n > 0)
		{
			*p *= v;
			p++; n--;
		}
		while (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), scale2));
			p += 2; n -= 2;
		}
	}
	else
	{
		while (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), scale2));
			p += 2; n -= 2;
		}
	}

#endif

	// scalar remainder (and the whole array when no SIMD path is compiled)
	while (n > 0)
	{
		*p *= v;
		p++; n--;
	}
}

Example #3

0

Show file

File: mul.c Project: 123raoul123/FHEW_in_c

/******************************************************************
*
* NEGACYCLIC FFT LOOK UP TABLE
*
******************************************************************/
/*
 * Multiply x by y into r: both inputs go through phi_forward, the
 * transformed values are multiplied point-wise as complex numbers
 * (four at a time), each product is divided by CPLXDIM, and the result
 * goes through phi_backward.
 * Uses the file-level buffers vector_x, vector_y and vector_res.
 */
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x,x);
  phi_forward(&vector_y,y);

  const __m256d scale = _mm256_set1_pd(CPLXDIM);

  for (int k = 0; k < CPLXDIM; k += 4)
  {
    /* x = a + ib, y = c + id */
    const __m256d a = _mm256_load_pd(vector_x.real + k);
    const __m256d b = _mm256_load_pd(vector_x.imag + k);
    const __m256d c = _mm256_load_pd(vector_y.real + k);
    const __m256d d = _mm256_load_pd(vector_y.imag + k);

    /* (a + ib) * (c + id) = (ac - bd) + i(bc + ad) */
    const __m256d bd = _mm256_mul_pd(b, d);
    const __m256d ad = _mm256_mul_pd(a, d);
    const __m256d re = _mm256_fmsub_pd(a, c, bd);
    const __m256d im = _mm256_fmadd_pd(b, c, ad);

    _mm256_store_pd(vector_res.real + k, _mm256_div_pd(re, scale));
    _mm256_store_pd(vector_res.imag + k, _mm256_div_pd(im, scale));
  }

  phi_backward(&vector_res,r);
}

Example #4

0

Show file

File: avx_vectorization.hpp Project: wichtounet/etl

    mul(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
        // Packed complex multiplication of two complex numbers per register.
        // lhs = [x1.real, x1.img, x2.real, x2.img]
        // rhs = [y1.real, y1.img, y2.real, y2.img]

        // [y1.real, y1.real, y2.real, y2.real]
        const __m256d rhs_real = _mm256_movedup_pd(rhs.value);

        // [y1.img, y1.img, y2.img, y2.img]
        const __m256d rhs_imag = _mm256_permute_pd(rhs.value, 0xF);

        // [x1.img, x1.real, x2.img, x2.real]
        const __m256d lhs_swapped = _mm256_permute_pd(lhs.value, 0x5);

        // [x1.img * y1.img, x1.real * y1.img, x2.img * y2.img, x2.real * y2.img]
        const __m256d cross = _mm256_mul_pd(lhs_swapped, rhs_imag);

        // Even lanes: lhs * rhs_real - cross (real part)
        // Odd lanes:  lhs * rhs_real + cross (imaginary part)
#if defined(__FMA__)
        return _mm256_fmaddsub_pd(lhs.value, rhs_real, cross);
#elif defined(__FMA4__)
        return _mm256_maddsub_pd(lhs.value, rhs_real, cross);
#else
        return _mm256_addsub_pd(_mm256_mul_pd(lhs.value, rhs_real), cross);
#endif
    }

Example #5

0

Show file

File: mul.c Project: 123raoul123/FHEW_in_c

/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
*
******************************************************************/
/*
 * Multiply x by y into r: both inputs go through fft_vector_forward, the
 * transformed values are multiplied point-wise as complex numbers
 * (four at a time), each product is divided by CPLXDIM, and the result
 * goes through fft_vector_backward.
 * Uses the file-level buffers vctr_x, vctr_y and vctr_res.
 */
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y){
  fft_vector_forward(&vctr_x,x);
  fft_vector_forward(&vctr_y,y);

  /* Loop-invariant divisor: broadcast once instead of on every iteration. */
  const __m256d dim = _mm256_set1_pd(CPLXDIM);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;

  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vctr_x.real+i);
    imag_x = _mm256_load_pd(vctr_x.imag+i);
    real_y = _mm256_load_pd(vctr_y.real+i);
    imag_y = _mm256_load_pd(vctr_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);

    //real_x = ac - bd, imag_x = bc + ad
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vctr_res.real+i,real_x);
    _mm256_store_pd(vctr_res.imag+i,imag_x);
  }
  fft_vector_backward(&vctr_res,r);
}

Example #6

0

Show file

File: fourier.cpp Project: marwan-abdellah/layerlab

// Evaluate three cosine series (coeffs[0], coeffs[1], coeffs[2], accumulated
// as Y, R and B) at angle phi and return Color3(R, G, B), where G is a fixed
// linear combination of Y, B and R.
//
// coeffs   three arrays of series coefficients
// nCoeffs  number of coefficients used from each array
// phi      evaluation angle
//
// NOTE(review): the SIMD path reads coeffs[k][0] unconditionally and loads
// four floats at a time starting at index 1 with an aligned load, so it
// presumably relies on nCoeffs >= 1 and on padded, suitably aligned
// coefficient storage -- confirm against the callers.
Color3 evalFourier3(float * const coeffs[3], size_t nCoeffs, Float phi) {
    #if FOURIER_SCALAR == 1
        // Scalar path: cos(i*phi) via the recurrence
        // cos((i+1)phi) = 2 cos(phi) cos(i phi) - cos((i-1)phi),
        // seeded with cos(0) = 1 and cos(-phi) = cos(phi).
        double cosPhi      = std::cos((double) phi),
              cosPhi_prev = cosPhi,
              cosPhi_cur  = 1.0;

        double Y = 0, R = 0, B = 0;

        for (size_t i=0; i<nCoeffs; ++i) {
            Y += coeffs[0][i] * cosPhi_cur;
            R += coeffs[1][i] * cosPhi_cur;
            B += coeffs[2][i] * cosPhi_cur;

            double cosPhi_next = 2*cosPhi*cosPhi_cur - cosPhi_prev;
            cosPhi_prev = cosPhi_cur; cosPhi_cur = cosPhi_next;
        }

        // Double-precision weights, identical to the SIMD path below, so
        // both build configurations use the same constants.
        double G = 1.39829*Y - 0.100913*B - 0.297375*R;

        return Color3((Float) R, (Float) G, (Float) B);
    #else
        double cosPhi = std::cos((double) phi);

        // The accumulators start with the 0-th coefficient; the loop below
        // then consumes coefficients from index 1 on, four per iteration.
        __m256d
            cosPhi_prev = _mm256_set1_pd(cosPhi),
            cosPhi_cur  = _mm256_set1_pd(1.0),
            Y           = _mm256_set_sd((double) coeffs[0][0]),
            R           = _mm256_set_sd((double) coeffs[1][0]),
            B           = _mm256_set_sd((double) coeffs[2][0]),
            factorPhi_prev, factorPhi_cur;

        initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

        for (size_t i=1; i<nCoeffs; i+=4) {
            // Next four cosine values as a linear combination of the two
            // previous ones, using the factors set up above.
            __m256d cosPhi_next = _mm256_add_pd(_mm256_mul_pd(factorPhi_prev, cosPhi_prev),
                    _mm256_mul_pd(factorPhi_cur,  cosPhi_cur));

            Y = _mm256_add_pd(Y, _mm256_mul_pd(cosPhi_next, _mm256_cvtps_pd(_mm_load_ps(coeffs[0]+i))));
            R = _mm256_add_pd(R, _mm256_mul_pd(cosPhi_next, _mm256_cvtps_pd(_mm_load_ps(coeffs[1]+i))));
            B = _mm256_add_pd(B, _mm256_mul_pd(cosPhi_next, _mm256_cvtps_pd(_mm_load_ps(coeffs[2]+i))));

            // presumably broadcasts lanes 2 and 3 of cosPhi_next as the
            // new "previous" and "current" values -- helper not visible here
            cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
            cosPhi_cur = _mm256_splat3_pd(cosPhi_next);
        }

        MM_ALIGN32 struct {
            double Y;
            double R;
            double B;
            double unused;
        } tmp;

        // Reduce the three accumulators into tmp.Y, tmp.R and tmp.B.
        simd::hadd(Y, R, B, _mm256_setzero_pd(), (double *) &tmp);

        double G = 1.39829*tmp.Y -0.100913*tmp.B - 0.297375*tmp.R;

        return Color3((Float) tmp.R, (Float) G, (Float) tmp.B);
    #endif
}

Example #7

0

Show file

File: cnn.c Project: MicBrain/Image-Classifier

//for 20 depth
// Convolution forward pass for inputs in[start..end] (inclusive): for each
// of the 20 filters and each position of the 8x8 output, accumulate the
// products of a 5x5 filter window with the input (offset by -2 in x and y),
// skipping window cells that fall outside the 8x8 input, then add the
// filter's bias and store the result with set_vol. The elapsed time is
// added to l->myTime.
void conv_forward_1(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 20; d++) {
      vol_t* f = l->filters[d];
      for(int ay = 0; ay < 8; ay++) {
        const int y = ay - 2;
        for(int ax = 0; ax < 8; ax++) {
          const int x = ax - 2;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            const int oy = y + fy;
            if(oy < 0 || oy >= 8) continue;
            for(int fx = 0; fx < 5; fx++) {
              const int ox = x + fx;
              if(ox < 0 || ox >= 8) continue;
              // 20 consecutive values per cell, processed 4 at a time
              const int f_base = ((5 * fy)+fx)*20;
              const int v_base = ((8 * oy)+ox)*20;
              for(int k = 0; k < 20; k += 4) {
                const __m256d fw = _mm256_loadu_pd (&(f->w[f_base + k]));
                const __m256d vw = _mm256_loadu_pd (&(V->w[v_base + k]));
                sum = _mm256_add_pd (_mm256_mul_pd(fw, vw), sum);
              }
            }
          }
          // horizontal sum of the four lanes, then the bias
          double a = 0.0;
          for(int lane = 0; lane < 4; lane++) {
            a += sum[lane];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}

Example #8

0

Show file

File: avx_simd.c Project: hkuro/embedded-fall2015

/**
 * Approximate pi as 4 * sum_{i=0}^{dt-1} delta / (1 + (i*delta)^2) with
 * delta = 1/dt (left Riemann sum of 4/(1+x^2) over [0, 1]).
 *
 * dt  number of sample points; 0 yields 0.0
 *
 * Four samples are evaluated per AVX iteration; samples left over when dt
 * is not a multiple of 4 are added by a scalar loop.
 */
double compute_pi(size_t dt)
{
    if (dt == 0)
        return 0.0;   /* nothing to sum; also avoids delta = 1/0 */

    const double delta = 1.0 / dt;
    const __m256d one = _mm256_set1_pd(1.0);
    const __m256d width = _mm256_set1_pd(delta);
    /* per-lane offsets of the four consecutive sample points */
    const __m256d lane_offset = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);
    __m256d acc = _mm256_setzero_pd();
    size_t i = 0;

    /* "dt - i >= 4" cannot wrap, unlike "i <= dt - 4" for dt < 4 */
    for (; dt - i >= 4; i += 4) {
        __m256d x = _mm256_add_pd(_mm256_set1_pd(i * delta), lane_offset);
        x = _mm256_mul_pd(x, x);
        x = _mm256_add_pd(one, x);
        acc = _mm256_add_pd(acc, _mm256_div_pd(width, x));
    }

    double tmp[4];
    _mm256_storeu_pd(tmp, acc);
    double pi = tmp[0] + tmp[1] + tmp[2] + tmp[3];

    /* remaining 0..3 samples */
    for (; i < dt; i++) {
        const double x = i * delta;
        pi += delta / (1.0 + x * x);
    }

    return pi * 4.0;
}

Example #9

0

Show file

File: AVX.c Project: smfreegard/rspamd

/* y[k] = x[k] * c for k in [0, n): eight elements per AVX iteration
 * (unaligned loads and stores), remaining elements one at a time. */
void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
  const __m256d factor = _mm256_set1_pd(c);
  ptrdiff_t pos = 0;

  while (pos <= n - 8) {
    const __m256d lo = _mm256_mul_pd(_mm256_loadu_pd(x + pos), factor);
    const __m256d hi = _mm256_mul_pd(_mm256_loadu_pd(x + pos + 4), factor);
    _mm256_storeu_pd(y + pos, lo);
    _mm256_storeu_pd(y + pos + 4, hi);
    pos += 8;
  }

  while (pos < n) {
    y[pos] = x[pos] * c;
    pos++;
  }
}

Example #10

0

Show file

File: perfgv.c Project: hrautila/armas

/*
 * Compute a Givens rotation (c, s) and r such that c*a + s*b = r and
 * c^2 + s^2 = 1.
 *
 * c, s, r  outputs
 * a, b     inputs; b == 0 gives (1, 0, a) and a == 0 gives (0, 1, b)
 *
 * Both candidate scalings are evaluated in one vector: lane 1 works with
 * the ratio b/a (used when |a| >= |b|), lane 2 with a/b (used when
 * |b| > |a|). Lanes 0 and 3 only carry filler values and are never read.
 * The 1 + t^2 step uses a fused multiply-add when __FMA__ is defined and a
 * separate multiply and add otherwise, so the outputs are always written.
 */
void gvrotg_fma(double *c, double *s, double *r, double a, double b)
{
    __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;
    if (b == 0.0) {
        *c = 1.0;
        *s = 0.0;
        *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0;
        *s = 1.0;
        *r = b;
        return;
    }

    // set_pd() takes its arguments in lane order [3, 2, 1, 0];
    // the comments below list lanes 0..3.

    one = _mm256_set1_pd(1.0);
    x0  = _mm256_set_pd(1.0, a, b, 1.0);   // x0 = {1, b,   a,   1}
    x1  = _mm256_set_pd(1.0, b, a, 1.0);   // x1 = {1, a,   b,   1}
    t0  = _mm256_div_pd(x0, x1);           // t0 = {1, b/a, a/b, 1}
#if defined(__FMA__)
    t2  = _mm256_fmadd_pd(t0, t0, one);    // t2 = {2, 1+(b/a)^2, 1+(a/b)^2, 2}
#else
    t2  = _mm256_add_pd(_mm256_mul_pd(t0, t0), one);
#endif
    u0  = _mm256_sqrt_pd(t2);              // u0 = sqrt(t2)
    u1  = _mm256_div_pd(one, u0);          // u1 = 1/u0
    b0  = _mm256_mul_pd(u0, x1);           // b0[1] = a*u0[1], b0[2] = b*u0[2]
    b1  = _mm256_mul_pd(t0, u1);           // b1[1] = (b/a)/u0[1], b1[2] = (a/b)/u0[2]

    // The 1/u factor must come from lane 1 or 2 of u1: lanes 0 and 3 of u1
    // are 1/sqrt(2), not 1/u.
    if (fabs(b) > fabs(a)) {
      *s = u1[2];
      *r = b0[2];
      *c = b1[2];
      if (signbit(b)) {
          *s = -(*s);
          *c = -(*c);
          *r = -(*r);
      }
    } else {
      *c = u1[1];
      *r = b0[1];
      *s = b1[1];
    }
}

Example #11

0

Show file

File: AVX.c Project: smfreegard/rspamd

/* z[k] = x[k] * y[k] for k in [0, n): eight elements per AVX iteration
 * (unaligned loads and stores), remaining elements one at a time. */
void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) {
  ptrdiff_t pos = 0;

  while (pos <= n - 8) {
    const __m256d x_lo = _mm256_loadu_pd(x + pos);
    const __m256d x_hi = _mm256_loadu_pd(x + pos + 4);
    const __m256d y_lo = _mm256_loadu_pd(y + pos);
    const __m256d y_hi = _mm256_loadu_pd(y + pos + 4);
    const __m256d prod_lo = _mm256_mul_pd(x_lo, y_lo);
    const __m256d prod_hi = _mm256_mul_pd(x_hi, y_hi);
    _mm256_storeu_pd(z + pos, prod_lo);
    _mm256_storeu_pd(z + pos + 4, prod_hi);
    pos += 8;
  }

  while (pos < n) {
    z[pos] = x[pos] * y[pos];
    pos++;
  }
}

Example #12

0

Show file

File: perfgv.c Project: hrautila/armas

/*
 * Compute a Givens rotation (c, s) and r such that c*a + s*b = r.
 * b == 0 gives (1, 0, a) and a == 0 gives (0, 1, b). Otherwise both
 * candidate scalings are evaluated in one AVX vector: the low half works
 * with the ratio b/a (used when |a| >= |b|), the high half with a/b (used
 * when |b| > |a|).
 */
void gvrotg_avx(double *c, double *s, double *r, double a, double b)
{
    if (b == 0.0) {
        *c = 1.0;
        *s = 0.0;
        *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0;
        *s = 1.0;
        *r = b;
        return;
    }

    /* _mm256_set_pd takes lanes in order [3, 2, 1, 0]; comments list lanes 0..3 */
    const __m256d numer  = _mm256_set_pd(1.0, a, b, 1.0);     /* {1, b,   a,   1} */
    const __m256d denom  = _mm256_set_pd(1.0, b, a, 1.0);     /* {1, a,   b,   1} */
    const __m256d ratio  = _mm256_div_pd(numer, denom);       /* {1, b/a, a/b, 1} */
    const __m256d ratio2 = _mm256_mul_pd(ratio, ratio);       /* {1, (b/a)^2, (a/b)^2, 1} */
    /* horizontal add: every lane of a half holds 1 + ratio^2 of that half */
    const __m256d hyp2   = _mm256_hadd_pd(ratio2, ratio2);
    const __m256d hyp    = _mm256_sqrt_pd(hyp2);              /* u  */
    const __m256d unit   = _mm256_set1_pd(1.0);
    const __m256d inv    = _mm256_div_pd(unit, hyp);          /* 1/u */
    /* lanes 0 and 3 take 1/u, lanes 1 and 2 take u; then scale by {1, a, b, 1} */
    const __m256d scaled = _mm256_mul_pd(_mm256_blend_pd(hyp, inv, 0x9), denom);
    const __m256d tratio = _mm256_mul_pd(ratio, inv);         /* ratio / u */

    if (fabs(b) > fabs(a)) {
        *s = scaled[3];  /* 1/u        */
        *r = scaled[2];  /* b*u        */
        *c = tratio[2];  /* (a/b)/u    */
        if (signbit(b)) {
            *s = -(*s);
            *c = -(*c);
            *r = -(*r);
        }
    } else {
        *c = scaled[0];  /* 1/u        */
        *r = scaled[1];  /* a*u        */
        *s = tratio[1];  /* (b/a)/u    */
    }
}

Example #13

0

Show file

File: func_geometric.hpp Project: XorNameName/Twist

		// Four-lane dot product: returns the sum of the lane-wise products
		// of ymm1 and ymm2, replicated into every lane of the result.
		inline float64x4_t dot(const float64x4_t ymm1, const float64x4_t ymm2)
		{
			const float64x4_t products = _mm256_mul_pd(ymm1, ymm2);
			// pair sums: {p0+p1, p0+p1, p2+p3, p2+p3}
			const float64x4_t pair_sums = _mm256_hadd_pd(products, products);
			// add the lower and upper 128-bit halves to get the full sum twice
			float64x2_t total = _mm_add_pd(_mm256_extractf128_pd(pair_sums, 0),
			                               _mm256_extractf128_pd(pair_sums, 1));
			return _mm256_broadcast_pd(&total);
		}

Example #14

0

Show file

File: poly_area_avx.c Project: samyvilar/poly_area

/*
 * AVX variant of the polygon-area accumulation. The function header is
 * produced by the irreg_poly_area_func_sign macro; the body uses `cords`
 * (read as consecutive x,y pairs of doubles) and `cords_len`, which that
 * macro presumably declares as parameters -- confirm against its definition.
 *
 * Returns the sum over adjacent vertex pairs of x_i*y_(i+1) - y_i*x_(i+1);
 * the halving/absolute-value step is left commented out at the end.
 * Returns 0 for a null or empty input.
 *
 * NOTE(review): the initial 256-bit load reads two vertices even when
 * cords_len == 1, the loads are aligned loads, and the loop bound
 * `cords_len - 4` would wrap if cords_len is an unsigned type smaller
 * than 4 -- verify the macro's parameter types and the callers' guarantees.
 */
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d
        curr,
        forw,
        coef_0,
        coef_1,
        end = _mm256_load_pd((const double *)cords),
        accum_sum = _mm256_setzero_pd();
    double accum_sum_aux;

    unsigned long index;
    // Each iteration consumes four vertices and adds four cross terms;
    // `end` carries the last two loaded vertices into the next iteration.
    for (index = 0; index < (cords_len - 4); index += 4) {
        curr = end;                                                 // x0,y0,x1,y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]);   // x2,y2,x3,y3
        end = _mm256_load_pd((const double *)&cords[index + 4]);    // x4,y4,x5,y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001); // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000); // x2, y2, x4, y4

        //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000),  // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)  // y1, x1, y3, x3
                ),
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4
                // ^^^^^^^^^^^^^^^  x1*y2, y1*x2, x3*y4, y3*x4
            )
        );
    }

    // Horizontal reduction: after these two steps every lane holds a0+a1+a2+a3.
    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ...
    // Scalar tail: start from the vector total and add the remaining
    // adjacent-pair terms one at a time via _calc_diff_of_adj_prods.
    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}

Example #15

0

Show file

File: dgemm-blocked-memaligned.c Project: UltimateBeaver/matrix_multiplication

/*
 * 4x4 block update c += a * b.
 *
 * c    4x4 block stored by columns, consecutive columns `lda` doubles apart
 *      (unaligned access)
 * a    K packed columns of 4 doubles each; read with aligned 32-byte loads
 * b    K packed rows of 4 doubles each
 * K    shared dimension
 */
static inline void matmul_4xkxkx4(int lda, int K, double* a, double* b, double* c)
{
  double* col_ptr[4];
  __m256d acc[4];

  /* addresses of the four columns of the c block */
  col_ptr[0] = c;
  for (int j = 1; j < 4; ++j)
    col_ptr[j] = col_ptr[j - 1] + lda;

  /* start from the current contents of c */
  for (int j = 0; j < 4; ++j)
    acc[j] = _mm256_loadu_pd(col_ptr[j]);

  /* rank-1 update per step: column i of a times row i of b */
  for (int i = 0; i < K; ++i, a += 4, b += 4)
  {
    const __m256d a_col = _mm256_load_pd(a);
    for (int j = 0; j < 4; ++j)
      acc[j] = _mm256_add_pd(acc[j], _mm256_mul_pd(a_col, _mm256_broadcast_sd(b + j)));
  }

  for (int j = 0; j < 4; ++j)
    _mm256_storeu_pd(col_ptr[j], acc[j]);
}

Example #16

0

Show file

File: avx_vectorization.hpp Project: wichtounet/etl

    div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
        // Packed complex division of two complex numbers per register:
        // x / y = (x * conj(y)) / (y.real^2 + y.img^2)
        // lhs = [x1.real, x1.img, x2.real, x2.img]
        // rhs = [y1.real, y1.img, y2.real, y2.img]

        // [y1.real, y1.real, y2.real, y2.real]
        const __m256d rhs_real = _mm256_movedup_pd(rhs.value);

        // [y1.img, y1.img, y2.img, y2.img]
        const __m256d rhs_imag = _mm256_permute_pd(rhs.value, 0xF);

        // [x1.img, x1.real, x2.img, x2.real]
        const __m256d lhs_swapped = _mm256_permute_pd(lhs.value, 0x5);

        // [x.img * y.img, x.real * y.img]
        const __m256d cross = _mm256_mul_pd(lhs_swapped, rhs_imag);

        // numerator: even lanes lhs * rhs_real + cross,
        //            odd lanes  lhs * rhs_real - cross
#ifdef __FMA__
        const __m256d numerator = _mm256_fmsubadd_pd(lhs.value, rhs_real, cross);
#else
        const __m256d direct    = _mm256_mul_pd(lhs.value, rhs_real);
        const __m256d neg_cross = _mm256_sub_pd(_mm256_set1_pd(0.0), cross);
        const __m256d numerator = _mm256_addsub_pd(direct, neg_cross);
#endif

        // [y.img^2, y.img^2]
        const __m256d imag_sq = _mm256_mul_pd(rhs_imag, rhs_imag);

        // denominator: y.real^2 + y.img^2 in every lane
#ifdef __FMA__
        const __m256d denominator = _mm256_fmadd_pd(rhs_real, rhs_real, imag_sq);
#else
        const __m256d real_sq     = _mm256_mul_pd(rhs_real, rhs_real);
        const __m256d denominator = _mm256_add_pd(real_sq, imag_sq);
#endif

        return _mm256_div_pd(numerator, denominator);
    }

Example #17

0

Show file

File: func_transform.hpp Project: XorNameName/Twist

		// Multiply a 4x4 matrix, given as four vectors ymm[0..3], by the
		// vector ymm_v = (x, y, z, w):
		//   result = ymm[0]*x + ymm[1]*y + ymm[2]*z + ymm[3]*w
		inline float64x4_t mat4_mul_vec4(const float64x4_t ymm[4], const float64x4_t ymm_v)
		{
			// _mm256_permute_pd shuffles only within each 128-bit half, so
			// duplicating the low/high element of each half yields:
			const float64x4_t dup_even = _mm256_permute_pd(ymm_v, 0x0); // x x z z
			const float64x4_t dup_odd  = _mm256_permute_pd(ymm_v, 0xF); // y y w w

			// Replicate one 128-bit half across the whole register
			// (0x00 = lower half twice, 0x11 = upper half twice). This also
			// avoids taking the address of a temporary.
			const float64x4_t bcast_x = _mm256_permute2f128_pd(dup_even, dup_even, 0x00); // x x x x
			const float64x4_t bcast_y = _mm256_permute2f128_pd(dup_odd,  dup_odd,  0x00); // y y y y
			const float64x4_t bcast_z = _mm256_permute2f128_pd(dup_even, dup_even, 0x11); // z z z z
			const float64x4_t bcast_w = _mm256_permute2f128_pd(dup_odd,  dup_odd,  0x11); // w w w w

			const float64x4_t mul0 = _mm256_mul_pd(ymm[0], bcast_x);
			const float64x4_t mul1 = _mm256_mul_pd(ymm[1], bcast_y);
			const float64x4_t mul2 = _mm256_mul_pd(ymm[2], bcast_z);
			const float64x4_t mul3 = _mm256_mul_pd(ymm[3], bcast_w);

			const float64x4_t add0 = _mm256_add_pd(mul0, mul1);
			const float64x4_t add1 = _mm256_add_pd(mul2, mul3);

			return _mm256_add_pd(add0, add1);
		}

Example #18

0

Show file

File: compute_pi.c Project: Jayjack0116/compute_pi

/**
 * Approximate pi from the Basel series: pi = sqrt(6 * sum_{k=1}^{n} 1/k^2).
 *
 * n  number of series terms; 0 yields 0.0
 *
 * Four terms are evaluated per AVX iteration; terms left over when n is
 * not a multiple of 4 are added by a scalar loop. The series starts at
 * k = 1 because the k = 0 term would be a division by zero.
 */
double compute_pi_euler_avx(size_t n)
{
	const __m256d one = _mm256_set1_pd(1.0);
	__m256d acc = _mm256_setzero_pd();
	size_t k = 1;

	/* terms k .. k+3; "n - k >= 3" cannot wrap because k <= n is checked first */
	for (; k <= n && n - k >= 3; k += 4) {
		const double base = (double)k;
		__m256d term = _mm256_set_pd(base + 3.0, base + 2.0, base + 1.0, base);
		term = _mm256_mul_pd(term, term);
		term = _mm256_div_pd(one, term);
		acc = _mm256_add_pd(acc, term);
	}

	double tmp[4];
	_mm256_storeu_pd(tmp, acc);
	double sum = tmp[0] + tmp[1] + tmp[2] + tmp[3];

	/* remaining 0..3 terms */
	for (; k <= n; k++) {
		const double kd = (double)k;
		sum += 1.0 / (kd * kd);
	}

	/* The factor 6 belongs inside the square root. The hardware square
	 * root keeps this function free of a libm link dependency. */
	return _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_set_sd(6.0 * sum)));
}

Example #19

0

Show file

/* Dot product of the n-element double arrays pa and pb. When both pointers
 * pass ALGEBRA_IS_ALIGNED, blocks of four elements are multiplied and
 * accumulated with AVX and the remaining n % 4 elements are handled by
 * vector_ps_double_basic; otherwise the whole computation is delegated to
 * vector_ps_double_basic. */
ALGEBRA_INLINE double	vector_ps_double (const double* pa,const double* pb,size_t n) {
    if(!(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb))) {
        return vector_ps_double_basic(pa, pb, n);
    }

    const size_t blocks = n/4;
    const size_t rest = n%4;
    double w = 0;

    if(blocks > 0) {
        __m256d acc = _mm256_setzero_pd();

        /* multiply four pairs and accumulate, one block per iteration */
        for(size_t blk = 0; blk < blocks; ++blk) {
            const __m256d va = _mm256_load_pd(pa);
            const __m256d vb = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            acc = _mm256_add_pd(acc, _mm256_mul_pd(va, vb));
        }

        /* final sum: horizontal add within each 128-bit half ... */
        acc = _mm256_hadd_pd(acc, acc);
        /* ... swap the upper and lower 128-bit halves ... */
        const __m256d swapped = _mm256_permute2f128_pd(acc, acc, 1);
        /* ... vertical add, so every lane holds the total ... */
        acc = _mm256_add_pd(acc, swapped);
        /* ... and extract the lowest lane */
        _mm_store_sd(&w,  _mm256_extractf128_pd(acc,0));
    }

    return w + vector_ps_double_basic(pa, pb, rest);
}

Example #20

0

Show file

File: mul.c Project: 123raoul123/FHEW_in_c

/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION
*
******************************************************************/
/*
 * Multiply x by y into r: both inputs go through
 * fft_vector_nonrec_forward, the transformed values are multiplied
 * point-wise as complex numbers (four at a time), each product is divided
 * by CPLXDIM, and the result goes through fft_vector_nonrec_backward.
 * Uses the file-level buffers vec_x, vec_y and vec_res.
 */
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y){
  fft_vector_nonrec_forward(&vec_x,x);
  fft_vector_nonrec_forward(&vec_y,y);

  /* Loop-invariant divisor: broadcast once instead of on every iteration. */
  const __m256d dim = _mm256_set1_pd(CPLXDIM);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;

  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vec_x.real+i);
    imag_x = _mm256_load_pd(vec_x.imag+i);
    real_y = _mm256_load_pd(vec_y.real+i);
    imag_y = _mm256_load_pd(vec_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);

    //real_x = ac - bd (fused multiply-subtract)
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    //imag_x = bc + ad (fused multiply-add)
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vec_res.real+i,real_x);
    _mm256_store_pd(vec_res.imag+i,imag_x);

  }
  fft_vector_nonrec_backward(&vec_res,r);
}

Example #21

0

Show file

File: fourier.cpp Project: marwan-abdellah/layerlab

// Evaluate the cosine series sum_i coeffs[i] * cos(i * phi) for
// i in [0, nCoeffs), either with a scalar recurrence (FOURIER_SCALAR == 1)
// or four terms at a time with AVX.
Float evalFourier(const float *coeffs, size_t nCoeffs, Float phi) {
    #if FOURIER_SCALAR == 1
        // cos((i+1)phi) = 2 cos(phi) cos(i phi) - cos((i-1)phi),
        // seeded with cos(0) = 1 and cos(-phi) = cos(phi)
        const double c = std::cos((double) phi);
        const double twoC = 2.0*c;
        double prev = c;
        double cur = 1.0;
        double total = 0.0;

        for (size_t k = 0; k < nCoeffs; ++k) {
            total += coeffs[k] * cur;

            const double next = twoC*cur - prev;
            prev = cur;
            cur = next;
        }

        return (Float) total;
    #else
        const double c = std::cos((double) phi);

        // The accumulator starts with the 0-th coefficient; the loop then
        // consumes coefficients from index 1 on, four per iteration.
        __m256d prev4 = _mm256_set1_pd(c);
        __m256d cur4 = _mm256_set1_pd(1.0);
        __m256d total4 = _mm256_set_sd((double) coeffs[0]);
        __m256d weightPrev, weightCur;

        initializeRecurrence(c, weightPrev, weightCur);

        for (size_t k = 1; k < nCoeffs; k += 4) {
            const __m256d coeff4 = _mm256_cvtps_pd(_mm_load_ps(coeffs+k));

            // next four cosine values from the two previous ones
            const __m256d next4 = _mm256_add_pd(_mm256_mul_pd(weightPrev, prev4),
                    _mm256_mul_pd(weightCur,  cur4));
            total4 = _mm256_add_pd(total4, _mm256_mul_pd(next4, coeff4));

            prev4 = _mm256_splat2_pd(next4);
            cur4 = _mm256_splat3_pd(next4);
        }

        return (Float) simd::hadd(total4);
    #endif
}

Example #22

0

Show file

File: main.cpp Project: CCJY/coliru

// For every row i in [0, n): c[i][0..31] -= sum_k a[k][i] * b[k][0..31],
// with all three matrices stored row-major with row length n. Exactly 32
// columns (eight 4-wide vectors) are processed per row regardless of n.
extern "C" void product32x32_avx(double *a, double *b, double *c, int n) 
{
	enum { kVectorsPerRow = 8, kLanes = 4 };

	for (int row = 0; row < n; row++) {
		__m256d acc[kVectorsPerRow];

		// current contents of row `row` of c
		for (int v = 0; v < kVectorsPerRow; v++)
			acc[v] = _mm256_loadu_pd(&c[row*n + v*kLanes]);

		for (int k = 0; k < n; k++) {
			const __m256d a_ki = _mm256_set1_pd(a[k*n+row]);

			for (int v = 0; v < kVectorsPerRow; v++) {
				const __m256d b_kv = _mm256_loadu_pd(&b[k*n + v*kLanes]);
				acc[v] = _mm256_sub_pd(acc[v], _mm256_mul_pd(a_ki, b_kv));
			}
		}

		for (int v = 0; v < kVectorsPerRow; v++)
			_mm256_storeu_pd(&c[row*n + v*kLanes], acc[v]);
	}
}

Example #23

0

Show file

File: dgemm-blocked-memaligned.c Project: UltimateBeaver/matrix_multiplication

// 4x4 matrix product C = A * B for column-major data (for row-major data,
// call it as matmul4x4(B, A, C)). A and C are accessed with aligned
// 32-byte loads/stores.
void matmul4x4(double *A, double *B, double *C) {
    __m256d a_col[4], c_col[4];

    // keep the four columns of A in registers
    for (int k = 0; k < 4; k++)
        a_col[k] = _mm256_load_pd(A + 4*k);

    // column c of the result is a linear combination of the columns of A,
    // weighted by the entries of column c of B
    for (int c = 0; c < 4; c++) {
        __m256d acc = _mm256_setzero_pd();
        for (int k = 0; k < 4; k++) {
            const __m256d weight = _mm256_set1_pd(B[4*c + k]);
            acc = _mm256_add_pd(_mm256_mul_pd(weight, a_col[k]), acc);
        }
        c_col[c] = acc;
    }

    // write the result only after all of B has been read
    for (int c = 0; c < 4; c++)
        _mm256_store_pd(C + 4*c, c_col[c]);
}

Example #24

0

Show file

File: fma.c Project: pzemtsov/MandelView

/*
 * Render Mandelbrot iteration counts, four pixels at a time (AVX2 + FMA).
 *
 * out     receives one byte per pixel, rows YSTART..SY-1 written back to back
 * X0, Y0  coordinates of pixel (0, 0)
 * scale   coordinate step per pixel
 * YSTART  first row to compute; SY is one past the last row
 * SX      pixels per row
 *
 * For each pixel, z -> z^2 + c is iterated up to 255 times starting at
 * z = c; the stored byte is the number of iterations for which
 * |z|^2 < 4 held before the first failure.
 *
 * Four bytes are written per inner step, so when SX is not a multiple of
 * 4 each row writes up to 3 bytes more than SX; the caller must size
 * `out` accordingly.
 */
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    const __m256d dd = _mm256_set1_pd (scale);
    const __m256d XX0 = _mm256_set1_pd (X0);
    const __m256d escape = _mm256_set1_pd (4.0);
    /* per 128-bit half: low byte of each 64-bit counter into bytes 0 and 1 */
    const __m256i pick_low_bytes = _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8);

    for (unsigned j = YSTART; j < SY; j++)	{
        const __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4)	{

            const __m128i ind = _mm_setr_epi32 ((int) i, (int) (i + 1), (int) (i + 2), (int) (i + 3));
            const __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            /* all-ones per lane while that lane has not escaped */
            __m256i cmp_mask = _mm256_set1_epi32 (-1);

            for (unsigned n = 0; n < 255; n++)	{
                const __m256d x2 = _mm256_mul_pd (x, x);
                const __m256d y2 = _mm256_mul_pd (y, y);
                const __m256d abs = _mm256_add_pd (x2, y2);
                const __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, escape, _CMP_LT_OS));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                /* subtracting -1 adds one to every still-active lane */
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                const __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }

            const __m256i result = _mm256_shuffle_epi8 (counts, pick_low_bytes);
            /* pixels 0,1 come from the lower half, pixels 2,3 from the upper half */
            const unsigned lo = (unsigned) _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0);
            const unsigned hi = (unsigned) _mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0);

            /* Byte stores instead of a type-punned 32-bit store: no
             * alignment or aliasing assumptions on `out`, and no signed
             * left shift into the sign bit. */
            out[0] = (unsigned char) (lo & 0xFFu);
            out[1] = (unsigned char) (lo >> 8);
            out[2] = (unsigned char) (hi & 0xFFu);
            out[3] = (unsigned char) (hi >> 8);
            out += 4;
        }
    }
}

Example #25

0

Show file

File: AVX.c Project: smfreegard/rspamd

/* z[k] = x[k] + y[k] * c for k in [0, n): four elements per AVX iteration
 * (unaligned loads and stores), remaining elements one at a time. */
void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
  const __m256d factor = _mm256_set1_pd(c);
  ptrdiff_t pos = 0;

  while (pos <= n - 4) {
    const __m256d y_vec = _mm256_loadu_pd(y + pos);
    const __m256d x_vec = _mm256_loadu_pd(x + pos);
    const __m256d scaled = _mm256_mul_pd(y_vec, factor);
    _mm256_storeu_pd(z + pos, _mm256_add_pd(x_vec, scaled));
    pos += 4;
  }

  while (pos < n) {
    z[pos] = x[pos] + y[pos] * c;
    pos++;
  }
}

Example #26

0

Show file

File: simpleclean.cpp Project: kernsuite-debian/wsclean

/**
 * Subtracts factor * psf from a band of the image, with the PSF element at
 * (psfWidth/2, psfHeight/2) aligned to image position (x, y).
 *
 * Rows processed: from the larger of startY and y - psfHeight/2 up to, but not
 * including, the smaller of endY and y + psfHeight/2.
 * Columns processed: from the larger of 0 and x - psfWidth/2 up to, but not
 * including, the smaller of imgWidth and x + psfWidth/2.
 *
 * Each row is handled four doubles at a time with unaligned AVX loads/stores;
 * the last (column count % 4) columns of the row are done in scalar code.
 *
 * @param image     Image buffer, row-major with row length imgWidth; modified in place.
 * @param imgWidth  Row length of the image.
 * @param imgHeight Not used by this function.
 * @param psf       PSF buffer, row-major with row length psfWidth.
 * @param psfWidth  Row length of the PSF.
 * @param psfHeight Number of rows of the PSF.
 * @param x, y      Image position the PSF centre is aligned to.
 * @param factor    Scale applied to the PSF before subtraction.
 * @param startY    First image row this call may touch.
 * @param endY      One past the last image row this call may touch.
 */
void SimpleClean::PartialSubtractImageAVX(double *image, size_t imgWidth, size_t imgHeight, const double *psf, size_t psfWidth, size_t psfHeight, size_t x, size_t y, double factor, size_t startY, size_t endY)
{
	size_t startX, endX;
	// Image coordinates of the PSF's first column/row; negative when the PSF
	// sticks out past the left/top edge. Computed in int so the sign survives.
	int offsetX = (int) x - (int) (psfWidth/2), offsetY = (int) y - (int) (psfHeight/2);
	
	if(offsetX > 0)
		startX = offsetX;
	else
		startX = 0;
	
	if(offsetY > (int) startY)
		startY = offsetY;
	
	endX = std::min(x + psfWidth/2, imgWidth);
	
	// Trim the vectorised span to a multiple of four columns; the remainder is
	// handled by the scalar loop at the end of each row.
	size_t unAlignedCount = (endX - startX) % 4;
	endX -= unAlignedCount;
	
	endY = std::min(y + psfHeight/2, endY);
	
	// Negated so that the subtraction can be expressed as a multiply-add.
	const __m256d mFactor = _mm256_set1_pd(-factor);
	for(size_t ypos = startY; ypos < endY; ++ypos)
	{
		double *imageIter = image + ypos * imgWidth + startX;
		const double *psfIter = psf + (ypos - offsetY) * psfWidth + startX - offsetX;
		for(size_t xpos = startX; xpos != endX; xpos+=4)
		{
			__m256d
				imgVal = _mm256_loadu_pd(imageIter),
				psfVal = _mm256_loadu_pd(psfIter);
			// _mm256_fmadd_pd belongs to the FMA (FMA3) extension, which compilers
			// advertise as __FMA__; __FMA4__ denotes a different instruction set.
#ifdef __FMA__
			_mm256_storeu_pd(imageIter, _mm256_fmadd_pd(psfVal, mFactor, imgVal));
#else
			_mm256_storeu_pd(imageIter, _mm256_add_pd(imgVal, _mm256_mul_pd(psfVal, mFactor)));
#endif
			imageIter+=4;
			psfIter+=4;
		}
		for(size_t xpos = endX; xpos!=endX + unAlignedCount; ++xpos)
		{
			*imageIter -= *psfIter * factor;
			++imageIter;
			++psfIter;
		}
	}
}

Example #27

0

Show file

File: HodgkinHuxley.cpp Project: davidbacisin/neuron-reliability

/**
 * Evaluates
 *
 *   ( -gL * (V[0] - EL)
 *     - gNa * V[2]^3 * V[3] * (V[0] - ENa)
 *     - gK  * V[1]^4 * (V[0] - EK)
 *     + I ) / C
 *
 * with the constants defined at the top of the body.
 *
 * @param V  State array; elements 0..3 are read. NOTE(review): presumably
 *           {membrane voltage, n, m, h} in Hodgkin-Huxley terms - confirm
 *           against the caller.
 * @param I  Additive input term (presumably the injected current).
 * @return   The value of the expression above.
 *
 * When __AVX__ is defined the four summands are computed in one 256-bit
 * register and reduced horizontally; otherwise plain scalar code is used.
 * The original author noted the AVX variant seemed slower than optimised
 * scalar code.
 */
double HodgkinHuxley::dV(double *V, double I) {
	const double C = 1.0;
	const double gNa = 120.0;
	const double gK = 36.0;
	const double gL = 0.3;
	const double ENa = 50.0;
	const double EK = -77.0;
	const double EL = -54.4;
#ifdef __AVX__
	// Lane layout, low to high: leak, sodium, potassium, input.
	// (_mm256_set_pd takes its arguments from the highest lane down.)
	// The last lane evaluates (1.0 - 0.0) * I so that I joins the same sum.
	const __m256d potential = _mm256_set_pd(1.0, V[0], V[0], V[0]);
	const __m256d reversal = _mm256_set_pd(0.0, EK, ENa, EL);
	const __m256d weight = _mm256_set_pd(I, -gK * pow(V[1], 4.0), -gNa * pow(V[2], 3.0) * V[3], -gL);

	const __m256d terms = _mm256_mul_pd(_mm256_sub_pd(potential, reversal), weight);

	// Horizontal sum, grouped as (lane0 + lane1) + (lane2 + lane3).
	const __m128d lowerHalf = _mm256_castpd256_pd128(terms);
	const __m128d upperHalf = _mm256_extractf128_pd(terms, 1);
	const __m128d lowerSum = _mm_hadd_pd(lowerHalf, lowerHalf);
	const __m128d upperSum = _mm_hadd_pd(upperHalf, upperHalf);

	double total = _mm_cvtsd_f64(_mm_add_pd(lowerSum, upperSum));
	total /= C;

	return total;
#else
	const double leak = -gL * (V[0] - EL);
	const double sodium = gNa * pow(V[2], 3.0) * V[3] * (V[0] - ENa);
	const double potassium = gK * pow(V[1], 4.0) * (V[0] - EK);

	return (leak - sodium - potassium + I) / C;
#endif
}

Example #28

0

Show file

File: lap-avx2.c Project: ursache/HPC-hacks

// Four-neighbour average for four consecutive elements at once.
//
// For k = 0..3 this writes
//   v2[k] = 0.25 * (v1[k + 1] + v1[k - m] + v1[k + m] + v1[k - 1])
// so m acts as the element distance between a point and its vertical
// neighbours. The inputs are read with unaligned loads; the result is
// written with a non-temporal store (_mm256_stream_pd), which requires v2
// to be 32-byte aligned.
inline
void kernel(adouble* v1, adouble * v2, int m)
{
	const __m256d east  = _mm256_loadu_pd(v1 + 1);
	const __m256d west  = _mm256_loadu_pd(v1 - 1);
	const __m256d north = _mm256_loadu_pd(v1 + m);
	const __m256d south = _mm256_loadu_pd(v1 - m);

	// Summation order is ((east + south) + north) + west.
	const __m256d east_south = _mm256_add_pd(east, south);
	const __m256d three = _mm256_add_pd(east_south, north);
	const __m256d neighbours = _mm256_add_pd(three, west);

	const __m256d quarter = _mm256_set1_pd(0.25);
	_mm256_stream_pd(v2, _mm256_mul_pd(quarter, neighbours));
}

Example #29

0

Show file

File: compute_pi.c Project: Jayjack0116/compute_pi

/*
 * Approximates pi with the first n terms of the Leibniz series
 *
 *   pi / 4 = sum_{k = 0}^{n - 1} (-1)^k / (2k + 1)
 *
 * Terms are evaluated four at a time with AVX; when n is not a multiple of
 * four the remaining one to three terms are added in scalar code.
 *
 * Returns 0.0 for n == 0.
 */
double compute_pi_leibniz_avx(size_t n)
{
	const __m256d two = _mm256_set1_pd(2.0);
	const __m256d one = _mm256_set1_pd(1.0);
	/*
	 * _mm256_set_pd lists lanes from highest to lowest, so the sign pattern
	 * here lines up with the index pattern built inside the loop: the highest
	 * lane carries term i (positive, since i is a multiple of four).
	 */
	const __m256d signs = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
	__m256d acc = _mm256_setzero_pd();
	size_t i = 0;

	/* "n - i >= 4" cannot wrap because i never exceeds n. */
	for (; n - i >= 4; i += 4) {
		const double k = (double)i;
		__m256d term = _mm256_set_pd(k, k + 1.0, k + 2.0, k + 3.0);
		term = _mm256_mul_pd(term, two);
		term = _mm256_add_pd(term, one);      /* 2k + 1 */
		term = _mm256_div_pd(signs, term);    /* (+/-1) / (2k + 1) */
		acc = _mm256_add_pd(acc, term);
	}

	double lanes[4];
	_mm256_storeu_pd(lanes, acc);
	double pi = 0.0;
	pi += lanes[0] + lanes[1] + lanes[2] + lanes[3];

	/* Leftover terms when n is not a multiple of four. */
	for (; i < n; i++) {
		const double sign = (i & 1u) ? -1.0 : 1.0;
		pi += sign / (2.0 * (double)i + 1.0);
	}

	return pi * 4.0;
}

Example #30

0

Show file

File: cnn.c Project: MicBrain/Image-Classifier

/*
 * Convolution forward pass for the inputs in[start] .. in[end] (inclusive).
 *
 * For each input, each of 16 filters, and each of the 32x32 output positions,
 * a 5x5 window is slid over the input with its top-left tap at
 * (ax - 2, ay - 2). Taps that fall outside 0..31 in either direction are
 * skipped. Every tap multiplies the three consecutive values stored at that
 * position in the filter with the three consecutive values stored at that
 * position in the input. The products are accumulated, the filter's bias
 * l->biases->w[d] is added, and the result is stored with
 * set_vol(A, ax, ay, d, value).
 *
 * The wall-clock time spent in this call (per timestamp_us) is added to
 * l->myTime.
 */
void conv_forward(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  /*
   * Only three doubles belong to each position. A plain 4-wide load at the
   * last position of a buffer would touch one double past those three, which
   * is out of bounds if w is sized exactly (assumption - the allocation is not
   * visible here). The masked load reads just the three low lanes and zeroes
   * the fourth, which the reduction below never used anyway.
   */
  const __m256i depth_mask = _mm256_set_epi64x(0, -1, -1, -1);
  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 16; d++) {
      vol_t* f = l->filters[d];
      int x = -2;
      int y = -2;
      for(int ay = 0; ay < 32; y += 1, ay++) {
        x = -2;
        for(int ax=0; ax < 32; x += 1, ax++) {
          double a = 0.0;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            int oy = y + fy;
            for(int fx = 0; fx < 5; fx++) {
              int ox = x + fx;
              if(oy >= 0 && oy < 32 && ox >=0 && ox < 32) {
                __m256d vector = _mm256_maskload_pd (&(f->w[((5 * fy)+fx)*3]), depth_mask);
                __m256d vector2 = _mm256_maskload_pd (&(V->w[((32 * oy)+ox)*3]), depth_mask);
                __m256d vectorMult = _mm256_mul_pd(vector, vector2);
                sum =_mm256_add_pd (vectorMult, sum);
              }
            }
          }
          /* Reduce the three meaningful lanes; lane 3 is ignored. */
          for(int lane = 0; lane < 3; lane++) {
            a+= sum[lane];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}