Example #1
double compute_pi(size_t dt)
{
    size_t i;
    double pi = 0.0;
    double delta = 1.0 / dt;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;
    ymm0 = _mm256_set1_pd(1.0);
    ymm1 = _mm256_set1_pd(delta);
    ymm2 = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);
    ymm4 = _mm256_setzero_pd();

    for (i = 0; i + 4 <= dt; i += 4) { /* i + 4 <= dt avoids the unsigned wrap of dt - 4 when dt < 4 */
        ymm3 = _mm256_set1_pd(i * delta);
        ymm3 = _mm256_add_pd(ymm3, ymm2);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);
        ymm3 = _mm256_add_pd(ymm0, ymm3);
        ymm3 = _mm256_div_pd(ymm1, ymm3);
        ymm4 = _mm256_add_pd(ymm4, ymm3);
    }
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
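A minimal test driver for this kernel might look like the following sketch: it assumes compute_pi sits in the same translation unit and the file is built with -mavx; M_PI is the POSIX math.h constant, and the step count is arbitrary.

#include <stdio.h>
#include <stddef.h>
#include <math.h>

double compute_pi(size_t dt);

int main(void)
{
    /* 1e8 rectangles of the 4/(1+x^2) integrand over [0,1] */
    double pi = compute_pi(100000000);
    printf("pi ~= %.12f, abs error = %.3e\n", pi, fabs(pi - M_PI));
    return 0;
}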
Example #2
Color3 evalFourier3(float * const coeffs[3], size_t nCoeffs, Float phi) {
    #if FOURIER_SCALAR == 1
        double cosPhi      = std::cos((double) phi),
               cosPhi_prev = cosPhi,
               cosPhi_cur  = 1.0;

        double Y = 0, R = 0, B = 0;

        for (size_t i=0; i<nCoeffs; ++i) {
            Y += coeffs[0][i] * cosPhi_cur;
            R += coeffs[1][i] * cosPhi_cur;
            B += coeffs[2][i] * cosPhi_cur;

            double cosPhi_next = 2*cosPhi*cosPhi_cur - cosPhi_prev;
            cosPhi_prev = cosPhi_cur; cosPhi_cur = cosPhi_next;
        }

        double G = 1.39829*Y - 0.100913*B - 0.297375*R;

        return Color3((Float) R, (Float) G, (Float) B);
    #else
        double cosPhi = std::cos((double) phi);

        __m256d
            cosPhi_prev = _mm256_set1_pd(cosPhi),
            cosPhi_cur  = _mm256_set1_pd(1.0),
            Y           = _mm256_set_sd((double) coeffs[0][0]),
            R           = _mm256_set_sd((double) coeffs[1][0]),
            B           = _mm256_set_sd((double) coeffs[2][0]),
            factorPhi_prev, factorPhi_cur;

        initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

        for (size_t i=1; i<nCoeffs; i+=4) {
            __m256d cosPhi_next = _mm256_add_pd(_mm256_mul_pd(factorPhi_prev, cosPhi_prev),
                    _mm256_mul_pd(factorPhi_cur,  cosPhi_cur));

            Y = _mm256_add_pd(Y, _mm256_mul_pd(cosPhi_next, _mm256_cvtps_pd(_mm_load_ps(coeffs[0]+i))));
            R = _mm256_add_pd(R, _mm256_mul_pd(cosPhi_next, _mm256_cvtps_pd(_mm_load_ps(coeffs[1]+i))));
            B = _mm256_add_pd(B, _mm256_mul_pd(cosPhi_next, _mm256_cvtps_pd(_mm_load_ps(coeffs[2]+i))));

            cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
            cosPhi_cur = _mm256_splat3_pd(cosPhi_next);
        }

        MM_ALIGN32 struct {
            double Y;
            double R;
            double B;
            double unused;
        } tmp;

        simd::hadd(Y, R, B, _mm256_setzero_pd(), (double *) &tmp);

        double G = 1.39829*tmp.Y -0.100913*tmp.B - 0.297375*tmp.R;

        return Color3((Float) tmp.R, (Float) G, (Float) tmp.B);
    #endif
}
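Both branches of evalFourier3 rely on the cosine recurrence cos(n*phi) = 2*cos(phi)*cos((n-1)*phi) - cos((n-2)*phi). A small scalar check (illustrative only, not part of the original source) confirms the recurrence against direct evaluation:

#include <stdio.h>
#include <math.h>

int main(void)
{
    double phi = 0.7, cosPhi = cos(phi);
    double prev = cosPhi, cur = 1.0;   /* seeds: cos(-phi) and cos(0) */
    for (int n = 1; n <= 8; ++n) {
        double next = 2.0 * cosPhi * cur - prev; /* cos(n*phi) from the two previous terms */
        printf("n=%d  recurrence=%.15f  direct=%.15f\n", n, next, cos(n * phi));
        prev = cur; cur = next;
    }
    return 0;
}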
Example #3
// for depth 20
void conv_forward_1(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 20; d++) {
      vol_t* f = l->filters[d];    
      int x = -2;
      int y = -2;
      for(int ay = 0; ay < 8; y += 1, ay++) {
        x = -2;
        for(int ax=0; ax < 8; x += 1, ax++) {
          double a = 0.0;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            int oy = y + fy;
            for(int fx = 0; fx < 5; fx++) {
              int ox = x + fx;
              if(oy >= 0 && oy < 8 && ox >=0 && ox < 8) {
                // 20 channels per position: five 4-wide multiply/accumulate steps
                __m256d vector = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20]));
                __m256d vector2 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20]));
                __m256d vectorMult = _mm256_mul_pd(vector, vector2);
                sum = _mm256_add_pd (vectorMult, sum);
                __m256d vector0 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+4]));
                __m256d vector9 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+4]));
                __m256d vectorMult0 = _mm256_mul_pd(vector0, vector9);
                sum = _mm256_add_pd (vectorMult0, sum);
                __m256d vector3 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+8]));
                __m256d vector4 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+8]));
                __m256d vectorMult2 = _mm256_mul_pd(vector3, vector4);
                sum = _mm256_add_pd (vectorMult2, sum);
                __m256d vector5 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+12]));
                __m256d vector6 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+12]));
                __m256d vectorMult3 = _mm256_mul_pd(vector5, vector6);
                sum = _mm256_add_pd (vectorMult3, sum);
                __m256d vector7 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+16]));
                __m256d vector8 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+16]));
                __m256d vectorMult4 = _mm256_mul_pd(vector7, vector8);
                sum = _mm256_add_pd (vectorMult4, sum);
              }
            }
          }
          for(int k = 0; k < 4; k++) { // horizontal sum; renamed from i to avoid shadowing the outer loop
            a += sum[k];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}
Example #4
void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
  __m256d YMM0, YMM1;
  for (i=0; i<=((n)-8); i+=8) {
    YMM0 = _mm256_loadu_pd(x+i);
    YMM1 = _mm256_loadu_pd(x+i+4);
    YMM0 = _mm256_add_pd(YMM0, YMM15);
    YMM1 = _mm256_add_pd(YMM1, YMM15);
    _mm256_storeu_pd(y+i, YMM0);
    _mm256_storeu_pd(y+i+4, YMM1);
  }
  for (; i<(n); i++) {
    y[i] = x[i] + c;
  }
}
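A sketch of an equivalence test against the scalar loop; unaligned stack buffers are fine because the kernel uses loadu/storeu, and n = 11 exercises both the 8-wide body and the scalar tail (sizes and constants are arbitrary):

#include <stdio.h>
#include <stddef.h>

void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);

int main(void)
{
    double x[11], y[11];
    for (int i = 0; i < 11; ++i) x[i] = i * 0.5;
    THDoubleVector_adds_AVX(y, x, 3.25, 11);
    for (int i = 0; i < 11; ++i)
        if (y[i] != x[i] + 3.25) { printf("mismatch at %d\n", i); return 1; }
    printf("ok\n");
    return 0;
}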
Example #5
int main(int, char**)
{
    volatile __m256d a = _mm256_setzero_pd();
    volatile __m256d b = _mm256_set1_pd(42.42);
    volatile __m256d result = _mm256_add_pd(a, b);
    (void)result;
    return 0;
}
Example #6
static inline void matmul_4xkxkx4(int lda, int K, double* a, double* b, double* c)
{
  __m256d a_coli, bi0, bi1, bi2, bi3;
  __m256d c_col0, c_col1, c_col2, c_col3;

  /* layout of 4x4 c matrix
      00 01 02 03
      10 11 12 13
      20 21 22 23
      30 31 32 33
  */
  double* c01_ptr = c + lda;
  double* c02_ptr = c01_ptr + lda;
  double* c03_ptr = c02_ptr + lda;

  // load old value of c
  c_col0 = _mm256_loadu_pd(c);
  c_col1 = _mm256_loadu_pd(c01_ptr);
  c_col2 = _mm256_loadu_pd(c02_ptr);
  c_col3 = _mm256_loadu_pd(c03_ptr);

  // for every column of a (or every row of b)
  for (int i = 0; i < K; ++i) 
  {
    a_coli = _mm256_load_pd(a);
    a += 4;

    bi0 = _mm256_broadcast_sd(b++);
    bi1 = _mm256_broadcast_sd(b++);
    bi2 = _mm256_broadcast_sd(b++);
    bi3 = _mm256_broadcast_sd(b++);

    c_col0 = _mm256_add_pd(c_col0, _mm256_mul_pd(a_coli, bi0));
    c_col1 = _mm256_add_pd(c_col1, _mm256_mul_pd(a_coli, bi1));
    c_col2 = _mm256_add_pd(c_col2, _mm256_mul_pd(a_coli, bi2));
    c_col3 = _mm256_add_pd(c_col3, _mm256_mul_pd(a_coli, bi3));
  }

  _mm256_storeu_pd(c, c_col0);
  _mm256_storeu_pd(c01_ptr, c_col1);
  _mm256_storeu_pd(c02_ptr, c_col2);
  _mm256_storeu_pd(c03_ptr, c_col3);
}
Example #7
inline
void kernel(adouble* v1, adouble * v2, int m)
{
	__m256d alpha = _mm256_set1_pd(0.25);
	//
	__m256d phi_e = _mm256_loadu_pd (v1 + 1 );
	__m256d phi_w = _mm256_loadu_pd (v1 - 1 );
	__m256d phi_n = _mm256_loadu_pd (v1 + m);
	__m256d phi_s = _mm256_loadu_pd (v1 - m);
	//
	phi_e = _mm256_add_pd(phi_e, phi_s);
	phi_e = _mm256_add_pd(phi_e, phi_n);
	//phi_e = _mm_fmadd_pd(alpha, phi_e, phi_w);
	phi_e = _mm256_add_pd(phi_e, phi_w);
	phi_e = _mm256_mul_pd(alpha, phi_e);
	//
	//printf("-> p = %p\n", &v2[0]);
	_mm256_stream_pd(v2, phi_e);

}
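Note that _mm256_stream_pd is a non-temporal store and, unlike _mm256_storeu_pd, requires v2 to be 32-byte aligned. A caller sketch under two assumptions (adouble is plain double, and the kernel is externally visible), using C11 aligned_alloc:

#include <stdlib.h>
#include <string.h>

typedef double adouble;   /* assumption: the kernel treats adouble as plain double */
void kernel(adouble *v1, adouble *v2, int m);

int main(void)
{
    int m = 64;
    adouble *src = malloc((size_t)(4 * m) * sizeof *src);
    /* _mm256_stream_pd needs a 32-byte aligned destination */
    adouble *dst = aligned_alloc(32, (size_t)(4 * m) * sizeof *dst);
    memset(src, 0, (size_t)(4 * m) * sizeof *src);
    kernel(src + m, dst, m);   /* offset v1 so v1 - m and v1 + m stay in bounds */
    free(src); free(dst);
    return 0;
}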
Example #8
 double hadd(const vector4d& rhs)
 {
     // rhs = (x0, x1, x2, x3)
     // tmp = (x2, x3, x0, x1)
     __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
     // tmp = (x0+x2, x1+x3, x2+x0, x3+x1)
     tmp = _mm256_add_pd(rhs, tmp);
     // tmp = (x0+x1+x2+x3 in all four lanes)
     tmp = _mm256_hadd_pd(tmp, tmp);
     return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
 }
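The scalar equivalent sums the lanes as (x0+x2) + (x1+x3); a reference like the following sketch is handy for unit-testing the reduction, keeping the same association order in case bit-exact comparison matters:

#include <immintrin.h>

/* Scalar reference for the reduction above, same association order */
static double hadd_ref(__m256d v)
{
    double x[4];
    _mm256_storeu_pd(x, v);
    return (x[0] + x[2]) + (x[1] + x[3]);
}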
Example #9
		inline float64x4_t mat4_mul_vec4(const float64x4_t ymm[4], const float64x4_t ymm_v)
		{
			float64x4_t perm0 = _mm256_permute_pd(ymm_v, 0x0); // x x z z (permute_pd works within each 128-bit lane)
			float64x4_t perm1 = _mm256_permute_pd(ymm_v, 0xF); // y y w w

			// _mm256_broadcast_pd needs an lvalue, so extract into named temporaries
			__m128d xx = _mm256_extractf128_pd(perm0, 0); // x x
			__m128d zz = _mm256_extractf128_pd(perm0, 1); // z z
			__m128d yy = _mm256_extractf128_pd(perm1, 0); // y y
			__m128d ww = _mm256_extractf128_pd(perm1, 1); // w w

			float64x4_t bcast0 = _mm256_broadcast_pd(&xx); // x x x x
			float64x4_t bcast1 = _mm256_broadcast_pd(&yy); // y y y y
			float64x4_t bcast2 = _mm256_broadcast_pd(&zz); // z z z z
			float64x4_t bcast3 = _mm256_broadcast_pd(&ww); // w w w w

			float64x4_t mul0 = _mm256_mul_pd(ymm[0], bcast0);
			float64x4_t mul1 = _mm256_mul_pd(ymm[1], bcast1);
			float64x4_t mul2 = _mm256_mul_pd(ymm[2], bcast2);
			float64x4_t mul3 = _mm256_mul_pd(ymm[3], bcast3);

			float64x4_t add0 = _mm256_add_pd(mul0, mul1);
			float64x4_t add1 = _mm256_add_pd(mul2, mul3);
			float64x4_t add2 = _mm256_add_pd(add0, add1);

			return add2;
		}
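With AVX2 available, each lane broadcast collapses to a single _mm256_permute4x64_pd, removing the permute/extract/broadcast sequence. A sketch of the same column-major multiply under that assumption (float64x4_t is taken to alias __m256d):

#include <immintrin.h>

typedef __m256d float64x4_t;   /* assumption: matches the alias used above */

static inline float64x4_t mat4_mul_vec4_avx2(const float64x4_t ymm[4], float64x4_t v)
{
    float64x4_t x = _mm256_permute4x64_pd(v, 0x00); /* x x x x */
    float64x4_t y = _mm256_permute4x64_pd(v, 0x55); /* y y y y */
    float64x4_t z = _mm256_permute4x64_pd(v, 0xAA); /* z z z z */
    float64x4_t w = _mm256_permute4x64_pd(v, 0xFF); /* w w w w */
    return _mm256_add_pd(
        _mm256_add_pd(_mm256_mul_pd(ymm[0], x), _mm256_mul_pd(ymm[1], y)),
        _mm256_add_pd(_mm256_mul_pd(ymm[2], z), _mm256_mul_pd(ymm[3], w)));
}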
Example #10
void sum_avx(double* c, double* a, double* b, int len)
{
    __m256d rA_AVX, rB_AVX, rC_AVX;   // variables for AVX

    // a, b and c must be 32-byte aligned for _mm256_load_pd/_mm256_store_pd
    int i;
    for (i = 0; i + 4 <= len; i += 4)
    {
        rA_AVX = _mm256_load_pd(&a[i]);
        rB_AVX = _mm256_load_pd(&b[i]);
        rC_AVX = _mm256_add_pd(rA_AVX, rB_AVX);
        _mm256_store_pd(&c[i], rC_AVX);
    }
    for (; i < len; i++)    // scalar tail when len is not a multiple of 4
        c[i] = a[i] + b[i];
}
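Because the kernel uses the aligned load/store forms, all three buffers must be 32-byte aligned; a caller sketch using C11 aligned_alloc (error checking omitted, values arbitrary):

#include <stdio.h>
#include <stdlib.h>

void sum_avx(double* c, double* a, double* b, int len);

int main(void)
{
    int len = 1000;
    /* aligned_alloc requires a size that is a multiple of the alignment; 8000 is */
    double *a = aligned_alloc(32, len * sizeof(double));
    double *b = aligned_alloc(32, len * sizeof(double));
    double *c = aligned_alloc(32, len * sizeof(double));
    for (int i = 0; i < len; ++i) { a[i] = i; b[i] = 2.0 * i; }
    sum_avx(c, a, b, len);
    printf("c[999] = %g\n", c[999]);   /* expect 2997 */
    free(a); free(b); free(c);
    return 0;
}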
Example #11
ALGEBRA_INLINE double	vector_ps_double (const double* pa,const double* pb,size_t n) {
    if(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
        size_t q = n/4;
        size_t r = n%4;
        double w = 0;

        if(q>0) {
            __m256d acc = _mm256_setzero_pd();
            __m256d i1 = _mm256_load_pd(pa);
            __m256d j1 = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            __m256d s = _mm256_mul_pd(i1, j1);
            acc = _mm256_add_pd(acc, s);

            while(--q != 0) {
                // load
                i1 = _mm256_load_pd(pa);
                j1 = _mm256_load_pd(pb);
                pa += 4;
                pb += 4;
                // multiply
                s = _mm256_mul_pd(i1, j1);
                // accumulate
                acc = _mm256_add_pd(acc, s);            
            }
            // final sum
            // horizontal add
            acc = _mm256_hadd_pd(acc, acc);
            // swap the high and low 128-bit halves
            __m256d accp = _mm256_permute2f128_pd(acc, acc, 1);
            // vertical add
            acc = _mm256_add_pd(acc, accp);
            // extract
            _mm_store_sd(&w,  _mm256_extractf128_pd(acc,0));
        }
        return w + vector_ps_double_basic(pa, pb, r);
    }
    return vector_ps_double_basic(pa, pb, n);
}
Example #12
Float evalFourier(const float *coeffs, size_t nCoeffs, Float phi) {
    #if FOURIER_SCALAR == 1
        double cosPhi      = std::cos((double) phi),
               cosPhi_prev = cosPhi,
               cosPhi_cur  = 1.0,
               value       = 0.0;

        for (size_t i=0; i<nCoeffs; ++i) {
            value += coeffs[i] * cosPhi_cur;

            double cosPhi_next = 2.0*cosPhi*cosPhi_cur - cosPhi_prev;
            cosPhi_prev = cosPhi_cur; cosPhi_cur = cosPhi_next;
        }

        return (Float) value;
    #else
        double cosPhi = std::cos((double) phi);

        __m256d
            cosPhi_prev = _mm256_set1_pd(cosPhi),
            cosPhi_cur  = _mm256_set1_pd(1.0),
            value       = _mm256_set_sd((double) coeffs[0]),
            factorPhi_prev, factorPhi_cur;

        initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

        for (size_t i=1; i<nCoeffs; i+=4) {
            __m256d coeff = _mm256_cvtps_pd(_mm_load_ps(coeffs+i));

            __m256d cosPhi_next = _mm256_add_pd(_mm256_mul_pd(factorPhi_prev, cosPhi_prev),
                    _mm256_mul_pd(factorPhi_cur,  cosPhi_cur));
            value = _mm256_add_pd(value, _mm256_mul_pd(cosPhi_next, coeff));
            cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
            cosPhi_cur = _mm256_splat3_pd(cosPhi_next);
        }

        return (Float) simd::hadd(value);
    #endif
}
Example #13
 inline vector4d haddp(const vector4d* row)
 {
     // row = (a,b,c,d)
     // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
     __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
     // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
     __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
     // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
     __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
     // tmp1 = (a2+a3, b2+b3, c0+c1, d0+d1)
     tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
     return _mm256_add_pd(tmp1, tmp2);
 }
Example #14
double compute_pi_leibniz_avx(size_t n)
{
	double pi = 0.0;
	register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

	ymm0 = _mm256_setzero_pd();
	ymm1 = _mm256_set1_pd(2.0);
	ymm2 = _mm256_set1_pd(1.0);
	ymm3 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
	
	for (size_t i = 0; i + 4 <= n; i += 4) {	/* i + 4 <= n avoids the unsigned wrap of n - 4 when n < 4 */
		ymm4 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
		ymm4 = _mm256_mul_pd(ymm4, ymm1);
		ymm4 = _mm256_add_pd(ymm4, ymm2);
		ymm4 = _mm256_div_pd(ymm3, ymm4);
		ymm0 = _mm256_add_pd(ymm0, ymm4);
	}
	double tmp[4] __attribute__((aligned(32)));
	_mm256_store_pd(tmp, ymm0);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

	return pi * 4.0;
}
Example #15
// this function assumes data is stored in column-major order
// if data is in row-major order, call it as matmul4x4(B, A, C)
void matmul4x4(double *A, double *B, double *C) {
    __m256d col[4], sum[4];
    //load every column into registers
    for(int i=0; i<4; i++)  
      col[i] = _mm256_load_pd(&A[i*4]);
    for(int i=0; i<4; i++) {
        sum[i] = _mm256_setzero_pd();      
        for(int j=0; j<4; j++) {
            sum[i] = _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(B[i*4+j]), col[j]), sum[i]);
        }           
    }
    for(int i=0; i<4; i++) 
      _mm256_store_pd(&C[i*4], sum[i]); 
}
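A usage sketch for the row-major case, following the comment's advice to swap the first two arguments; the arrays are 32-byte aligned because the kernel uses _mm256_load_pd/_mm256_store_pd:

#include <stdio.h>

void matmul4x4(double *A, double *B, double *C);

int main(void)
{
    _Alignas(32) double A[16], B[16], C[16];
    for (int i = 0; i < 16; ++i) { A[i] = i; B[i] = (i % 4 == i / 4); } /* B = identity */
    matmul4x4(B, A, C);   /* row-major data: swap A and B as the comment suggests */
    printf("C[5] = %g (expect %g)\n", C[5], A[5]);   /* A * I == A */
    return 0;
}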
Example #16
void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256d YMM15 = _mm256_set_pd(c, c, c, c);
  __m256d YMM0, YMM1, YMM2, YMM3;
  for (i=0; i<=((n)-4); i+=4) {
    YMM0 = _mm256_loadu_pd(y+i);
    YMM1 = _mm256_loadu_pd(x+i);
    YMM2 = _mm256_mul_pd(YMM0, YMM15);
    YMM3 = _mm256_add_pd(YMM1, YMM2);
    _mm256_storeu_pd(z+i, YMM3);
  }
  for (; i<(n); i++) {
    z[i] = x[i] + y[i] * c;
  }
}
Example #17
void jacobi_avx(GRID_T *oldGrid, GRID_T *newGrid, int width, int height){
	int remainder;

	remainder = (width-2)%4;
	/* Each vector holds one of the four neighbor values used by a Jacobi
	 * iteration step: the upper, lower, left, or right neighbor. */
	__m256d up_row, below_row, right_row, left_row;

	__m256d factor = _mm256_set1_pd(0.25);

	for(int i = 1; i < height-1; i++){
		for(int j = 1; j < width-4; j += 4){
			up_row = _mm256_loadu_pd(&(oldGrid[(i-1)*width + j]));
			below_row = _mm256_loadu_pd(&(oldGrid[(i+1)*width + j]));

			right_row = _mm256_loadu_pd(&(oldGrid[i*width + (j+1)]));
			left_row = _mm256_loadu_pd(&(oldGrid[i*width + (j-1)]));


			/* Sum up n-th element of each vector */
			__m256d dest;
			__m256d add_1 =  _mm256_add_pd(up_row, below_row);
			__m256d add_2 =  _mm256_add_pd(left_row, right_row);
			dest =  _mm256_add_pd(add_2, add_1);
			/* Multiply by 0.25 */
			dest = _mm256_mul_pd(dest, factor);

			// Use the unaligned store method; the aligned one produces a segmentation fault here
			_mm256_storeu_pd(&(newGrid[i*width + j]), dest);
		}
		for(int j = width - remainder - 1; j < width -1; j++){
			newGrid[i*width + j] = (oldGrid[i*width + (j-1)] + oldGrid[i*width + (j+1)] + oldGrid[(i-1)*width + j] + oldGrid[(i+1)*width + j]) * 0.25;
		}
	}
	return;
}
Example #18
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256d dd = _mm256_set1_pd (scale);
    __m256d XX0 = _mm256_set1_pd (X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4)	{

            __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3);
            __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
                __m256d x2 = _mm256_mul_pd (x, x);
                __m256d y2 = _mm256_mul_pd (y, y);
                __m256d abs = _mm256_add_pd (x2, y2);
                __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), _CMP_LT_OS)); /* |z|^2 < 4 */
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8));
            *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0) | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16);
            out += 4;
        }
    }
}
Example #19
double compute_pi_leibniz_avx_opt(size_t n)
{
	double pi = 0.0;
	register __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
	register __m256d ymm9, ymm10, ymm11, ymm12, ymm13;

	ymm0 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
	ymm1 = _mm256_set_pd(1.0, 3.0, 5.0, 7.0);
	ymm2 = _mm256_set_pd(9.0, 11.0, 13.0, 15.0);
	ymm3 = _mm256_set_pd(17.0, 19.0, 21.0, 23.0);
	ymm4 = _mm256_set_pd(25.0, 27.0, 29.0, 31.0);
	ymm13 = _mm256_set1_pd(32.0);

	ymm5 = _mm256_setzero_pd();
	ymm6 = _mm256_setzero_pd();
	ymm7 = _mm256_setzero_pd();
	ymm8 = _mm256_setzero_pd();
	
	for (size_t i = 0; i + 16 <= n; i += 16) {	/* i + 16 <= n avoids the unsigned wrap of n - 16 */
		ymm9 = _mm256_div_pd(ymm0, ymm1);
		ymm1 = _mm256_add_pd(ymm1, ymm13);
		ymm10 = _mm256_div_pd(ymm0, ymm2);
		ymm2 = _mm256_add_pd(ymm2, ymm13);
		ymm11 = _mm256_div_pd(ymm0, ymm3);
		ymm3 = _mm256_add_pd(ymm3, ymm13);
		ymm12 = _mm256_div_pd(ymm0, ymm4);
		ymm4 = _mm256_add_pd(ymm4, ymm13);

		ymm5 = _mm256_add_pd(ymm5, ymm9);
		ymm6 = _mm256_add_pd(ymm6, ymm10);
		ymm7 = _mm256_add_pd(ymm7, ymm11);
		ymm8 = _mm256_add_pd(ymm8, ymm12);
	}
	double tmp[4] __attribute__((aligned(32)));
	_mm256_store_pd(tmp, ymm5);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
	_mm256_store_pd(tmp, ymm6);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
	_mm256_store_pd(tmp, ymm7);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
	_mm256_store_pd(tmp, ymm8);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

	return pi * 4.0;
}
Example #20
void SimpleClean::PartialSubtractImageAVX(double *image, size_t imgWidth, size_t imgHeight, const double *psf, size_t psfWidth, size_t psfHeight, size_t x, size_t y, double factor, size_t startY, size_t endY)
{
	size_t startX, endX;
	int offsetX = (int) x - (int) (psfWidth/2), offsetY = (int) y - (int) (psfHeight/2); /* cast before subtracting so the size_t arithmetic cannot wrap */
	
	if(offsetX > 0)
		startX = offsetX;
	else
		startX = 0;
	
	if(offsetY > (int) startY)
		startY = offsetY;
	
	endX = std::min(x + psfWidth/2, imgWidth);
	
	size_t unAlignedCount = (endX - startX) % 4;
	endX -= unAlignedCount;
	
	endY = std::min(y + psfHeight/2, endY);
	
	const __m256d mFactor = _mm256_set1_pd(-factor);
	for(size_t ypos = startY; ypos < endY; ++ypos)
	{
		double *imageIter = image + ypos * imgWidth + startX;
		const double *psfIter = psf + (ypos - offsetY) * psfWidth + startX - offsetX;
		for(size_t xpos = startX; xpos != endX; xpos+=4)
		{
			__m256d
				imgVal = _mm256_loadu_pd(imageIter),
				psfVal = _mm256_loadu_pd(psfIter);
#ifdef __FMA__ /* _mm256_fmadd_pd is an FMA3 intrinsic, guarded by __FMA__ */
			_mm256_storeu_pd(imageIter, _mm256_fmadd_pd(psfVal, mFactor, imgVal));
#else
			_mm256_storeu_pd(imageIter, _mm256_add_pd(imgVal, _mm256_mul_pd(psfVal, mFactor)));
#endif
			imageIter+=4;
			psfIter+=4;
		}
		for(size_t xpos = endX; xpos!=endX + unAlignedCount; ++xpos)
		{
			*imageIter -= *psfIter * factor;
			++imageIter;
			++psfIter;
		}
	}
}
Example #21
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d
        curr,
        forw,
        coef_0,
        coef_1,
        end = _mm256_load_pd((const double *)cords),
        accum_sum = _mm256_setzero_pd();
    double accum_sum_aux;

    unsigned long index;
    for (index = 0; index + 4 < cords_len; index += 4) { // avoids the unsigned wrap of cords_len - 4
        curr = end;                                                 // x0,y0,x1,y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]);   // x2,y2,x3,y3
        end = _mm256_load_pd((const double *)&cords[index + 4]);    // x4,y4,x5,y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001); // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000); // x2, y2, x4, y4

        //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1*x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000),  // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)  // y1, x1, y3, x3
                ),
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4
                // ^^^^^^^^^^^^^^^  x1*y2, y1*x2, x3*y4, y3*x4
            )
        );
    }

    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ...
    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
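The vector body evaluates the shoelace sum, sum_i (x_i*y_{i+1} - y_i*x_{i+1}); a scalar reference under the layout the loads suggest (cords as x,y pairs, with the final halving and absolute value left to the caller, as in the commented-out last line):

#include <stddef.h>

/* Scalar shoelace reference; the pair layout of cords is an assumption
   inferred from the 4-double strides of the loads above. */
static double poly_area_sum_ref(const double (*cords)[2], size_t cords_len)
{
    double sum = 0.0;
    for (size_t i = 0; i + 1 < cords_len; ++i)
        sum += cords[i][0] * cords[i + 1][1] - cords[i][1] * cords[i + 1][0];
    return sum;   /* area = |sum| / 2 once the polygon is closed */
}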
Example #22
    div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img]
        //rhs = [y1.real, y1.img, y2.real, y2.img]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real]
        __m256d ymm0 = _mm256_movedup_pd(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

        //ymm4 = [x1.img * y1.img, x1.real * y1.img, x2.img * y2.img, x2.real * y2.img]
        __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
        __m256d t1   = _mm256_mul_pd(lhs.value, ymm0);
        __m256d t2   = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
        __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

        //ymm3 = [y1.img^2, y1.img^2, y2.img^2, y2.img^2]
        __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
        __m256d t3   = _mm256_mul_pd(ymm0, ymm0);
        ymm0         = _mm256_add_pd(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_pd(ymm5, ymm0);
    }
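For reference, the lanes implement the textbook formula (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c^2+d^2); a scalar version for a single element, matching the lane math above:

/* Scalar reference for one complex division */
static void cdiv_ref(double a, double b, double c, double d, double *re, double *im)
{
    double den = c * c + d * d;
    *re = (a * c + b * d) / den;
    *im = (b * c - a * d) / den;
}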
Example #23
double compute_pi_euler_avx(size_t n)
{
	double pi = 0.0;
	register __m256d ymm0, ymm1, ymm2, ymm3;
	ymm0 = _mm256_setzero_pd();
	ymm1 = _mm256_set1_pd(1.0);
	ymm2 = _mm256_set1_pd(6.0);

	/* start at k = 1: the series sum(1/k^2) has no k = 0 term (it would divide by zero) */
	for (size_t i = 1; i + 3 <= n; i += 4) {
		ymm3 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
		ymm3 = _mm256_mul_pd(ymm3, ymm3);
		ymm3 = _mm256_div_pd(ymm1, ymm3);
		ymm0 = _mm256_add_pd(ymm0, ymm3);
	}
	ymm3 = _mm256_mul_pd(ymm2, ymm0);	/* scale the accumulated sum by 6 */
	double tmp[4] __attribute__((aligned(32)));
	_mm256_store_pd(tmp, ymm3);	/* store ymm3, the scaled sum; storing ymm0 here dropped the factor of 6 */
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return sqrt( pi );
}
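The series being accumulated is the Basel sum, 1/1^2 + 1/2^2 + ... = pi^2/6, so pi = sqrt(6 * sum); a scalar reference for cross-checking:

#include <math.h>
#include <stddef.h>

/* Scalar Basel-series reference: pi = sqrt(6 * sum_{k=1..n} 1/k^2) */
static double compute_pi_euler_ref(size_t n)
{
    double sum = 0.0;
    for (size_t k = 1; k <= n; ++k)
        sum += 1.0 / ((double)k * (double)k);
    return sqrt(6.0 * sum);
}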
Example #24
inline
void addGrids(
    complexd dst[]
  , const complexd srcs[]
  , int nthreads
  , int grid_pitch
  , int grid_size
  )
{
  int siz = grid_size*grid_pitch;
#pragma omp parallel for
  for (unsigned int i = 0; i < siz*sizeof(complexd)/(256/8); i++) {
    __m256d sum = as256pc(srcs)[i];
    // __m256d sum = _mm256_loadu_pd(reinterpret_cast<const double*>(as256pc(srcs)+i));

    for (int g = 1; g < nthreads; g ++)
      sum = _mm256_add_pd(sum, as256pc(srcs + g * siz)[i]);

    as256p(dst)[i] = sum;
  }
}
Example #25
/* sum double vectors ----------------------------------------------------------
* sum double vectors: out=data1.+data2
* args   : double *data1    I   input double array
*          double *data2    I   input double array
*          int    n         I   number of input data
*          double *out      O   output double array
* return : none
* note   : AVX instructions are used if "AVX_ENABLE" is defined
*-----------------------------------------------------------------------------*/
extern void sumvd(const double *data1, const double *data2, int n, double *out)
{
    int i;
#if !defined(AVX_ENABLE)
    for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
#else
    int m=n/4;
    __m256d xmm1,xmm2,xmm3;

    if (n<8) {
        for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
    }
    else {
        for (i=0;i<4*m;i+=4) {
            xmm1=_mm256_loadu_pd(&data1[i]);
            xmm2=_mm256_loadu_pd(&data2[i]);
            xmm3=_mm256_add_pd(xmm1,xmm2);
            _mm256_storeu_pd(&out[i],xmm3);
        }
        for (;i<n;i++)  out[i]=data1[i]+data2[i];
    }
#endif
}
Example #26
void conv_forward(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 16; d++) {
      vol_t* f = l->filters[d];
      int x = -2;
      int y = -2;
      for(int ay = 0; ay < 32; y += 1, ay++) {
        x = -2;
        for(int ax=0; ax < 32; x += 1, ax++) {
          double a = 0.0;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            int oy = y + fy;
            for(int fx = 0; fx < 5; fx++) {
              int ox = x + fx;
              if(oy >= 0 && oy < 32 && ox >=0 && ox < 32) {
                // 3 channels per pixel; the 4-wide load reads one extra lane, which the 3-lane sum below ignores
                __m256d vector = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*3]));
                __m256d vector2 = _mm256_loadu_pd (&(V->w[((32 * oy)+ox)*3]));
                __m256d vectorMult = _mm256_mul_pd(vector, vector2);
                sum =_mm256_add_pd (vectorMult, sum);
              }
            }
          }
          for(int k = 0; k < 3; k++) { // only 3 of the 4 lanes are meaningful
            a += sum[k];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}
Example #27
void ks_tanh_int_d8x4(
    int    k,
    int    rhs,
    double *h,  // NOP
    double *u,
    double *aa, // NOP
    double *a,
    double *bb, // NOP
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i, rhs_left;
  double scal = ker->scal;
  double cons = ker->cons;


  v4df_t    c03_0,    c03_1,    c03_2,    c03_3;
  v4df_t    c47_0,    c47_1,    c47_2,    c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t u03, u47;
  v4df_t a03, a47, A03, A47; // prefetched A 
  v4df_t b0, b1, b2, b3, B0; // prefetched B
  v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;


  // Rank-k update segment
  #include "ks_rank_k_int_d8x4.h"


  // Accumulate
  if ( aux->pc ) {
    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4  ) );
    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8  ) );
    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
  }


  // Scale before the kernel evaluation
  c_tmp.v  = _mm256_broadcast_sd( &scal );
  c03_0.v  = _mm256_mul_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( c_tmp.v, c47_3.v );


  // Shift before the kernel evaluation
  c_tmp.v  = _mm256_broadcast_sd( &cons );
  c03_0.v  = _mm256_add_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_add_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( c_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );


  // Prefetch u and w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u + 8 ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  // c = tanh( c );  (_mm256_tanh_pd is an SVML-style vector math function, not a baseline AVX intrinsic)
  c03_0.v  = _mm256_tanh_pd( c03_0.v );
  c03_1.v  = _mm256_tanh_pd( c03_1.v );
  c03_2.v  = _mm256_tanh_pd( c03_2.v );
  c03_3.v  = _mm256_tanh_pd( c03_3.v );
  c47_0.v  = _mm256_tanh_pd( c47_0.v );
  c47_1.v  = _mm256_tanh_pd( c47_1.v );
  c47_2.v  = _mm256_tanh_pd( c47_2.v );
  c47_3.v  = _mm256_tanh_pd( c47_3.v );
  
  
  // Multiple rhs kernel summation.
  #include "ks_kernel_summation_int_d8x4.h"

}
Example #28
Color3 sampleFourier3(float * const coeffs[3], const double *recip, size_t nCoeffs,
        Float sample, Float &pdf, Float &phi) {
    bool flip = false;
    if (sample < 0.5f) {
        sample *= 2.0f;
    } else {
        sample = 1.0f - 2.0f * (sample - 0.5f);
        flip = true;
    }

    int iterations = 0;

    double a = 0.0,
           c = math::Pi_d,
           coeff0 = coeffs[0][0],
           y = coeff0*math::Pi_d*sample,
           deriv = 0.0,
           b = 0.5 * math::Pi_d,
           cosB = 0,
           sinB = 1;

    if (nCoeffs > 10 && sample != 0 && sample != 1) {
        float stddev = std::sqrt(2.0f / 3.0f * std::log(coeffs[0][1] / coeffs[0][2]));
        if (std::isfinite(stddev)) {
            b = std::min(c, (double) math::normal_quantile(0.5f + sample / 2) * stddev);
            cosB = std::cos(b);
            sinB = std::sqrt(1 - cosB * cosB);
        }
    }

    #if FOURIER_SCALAR != 1
        __m256d factorB_prev, factorB_cur;
    #endif

    while (true) {
        #if FOURIER_SCALAR == 1
            double cosB_prev = cosB,
                   sinB_prev = -sinB,
                   sinB_cur  = 0.0,
                   cosB_cur  = 1.0,
                   value     = coeff0 * b;

            deriv = coeff0;

            for (size_t j=1; j<nCoeffs; ++j) {
                double sinB_next = 2.0*cosB*sinB_cur - sinB_prev,
                       cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                       coeff     = (double) coeffs[0][j];

                value += coeff * recip[j] * sinB_next;
                deriv += coeff * cosB_next;

                sinB_prev = sinB_cur; sinB_cur = sinB_next;
                cosB_prev = cosB_cur; cosB_cur = cosB_next;
            }
        #else
            initializeRecurrence(cosB, factorB_prev, factorB_cur);

            __m256d
                sinB_prev  = _mm256_set1_pd(-sinB),
                sinB_cur   = _mm256_set1_pd(0.0),
                cosB_prev  = _mm256_set1_pd(cosB),
                cosB_cur   = _mm256_set1_pd(1.0),
                value_vec  = _mm256_set_sd(coeff0 * b),
                deriv_vec  = _mm256_set_sd(coeff0);

            for (size_t j=1; j<nCoeffs; j+=4) {
                __m128 coeff_vec_f = _mm_load_ps(coeffs[0]+j);
                __m256d recip_vec  = _mm256_load_pd(recip+j);
                __m256d coeff_vec  = _mm256_cvtps_pd(coeff_vec_f);

                __m256d sinB_next = _mm256_add_pd(
                        _mm256_mul_pd(factorB_prev, sinB_prev),
                        _mm256_mul_pd(factorB_cur, sinB_cur));

                __m256d cosB_next = _mm256_add_pd(
                        _mm256_mul_pd(factorB_prev, cosB_prev),
                        _mm256_mul_pd(factorB_cur, cosB_cur));

                value_vec = _mm256_add_pd(value_vec, _mm256_mul_pd(
                    _mm256_mul_pd(recip_vec, coeff_vec), sinB_next));
                deriv_vec = _mm256_add_pd(deriv_vec, _mm256_mul_pd(coeff_vec, cosB_next));

                sinB_prev = _mm256_splat2_pd(sinB_next);
                cosB_prev = _mm256_splat2_pd(cosB_next);
                sinB_cur  = _mm256_splat3_pd(sinB_next);
                cosB_cur  = _mm256_splat3_pd(cosB_next);
            }

            double value = simd::hadd(value_vec);
            deriv = simd::hadd(deriv_vec);
        #endif

        value -= y;

        if (std::abs(value) <= 1e-5 * coeff0 || ++iterations > 20)
            break;
        else if (value > 0.0)
            c = b;
        else
            a = b;

        b -= value / deriv;

        if (!(b >= a && b <= c))
            b = 0.5f * (a + c);

        cosB = std::cos(b);
        sinB = std::sqrt(1-cosB*cosB);
    }

    double Y = deriv;
    if (flip)
        b = 2.0*math::Pi_d - b;

    pdf = (Float) (math::InvTwoPi_d * Y / coeff0);
    phi = (Float) b;

    #if FOURIER_SCALAR == 1
        double cosB_prev = cosB,
               cosB_cur  = 1.0;

        double R = coeffs[1][0];
        double B = coeffs[2][0];

        for (size_t j=1; j<nCoeffs; ++j) {
            double cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                   coeffR    = (double) coeffs[1][j],
                   coeffB    = (double) coeffs[2][j];

            R += coeffR * cosB_next;
            B += coeffB * cosB_next;

            cosB_prev = cosB_cur; cosB_cur = cosB_next;
        }
    #else
        __m256d
            cosB_prev  = _mm256_set1_pd(cosB),
            cosB_cur   = _mm256_set1_pd(1.0),
            R_vec  = _mm256_set_sd(coeffs[1][0]),
            B_vec  = _mm256_set_sd(coeffs[2][0]);

        for (size_t j=1; j<nCoeffs; j+=4) {
            __m128 coeff_R_vec_f = _mm_load_ps(coeffs[1]+j);
            __m128 coeff_B_vec_f = _mm_load_ps(coeffs[2]+j);
            __m256d coeff_R_vec  = _mm256_cvtps_pd(coeff_R_vec_f);
            __m256d coeff_B_vec  = _mm256_cvtps_pd(coeff_B_vec_f);

            __m256d cosB_next = _mm256_add_pd(
                    _mm256_mul_pd(factorB_prev, cosB_prev),
                    _mm256_mul_pd(factorB_cur, cosB_cur));

            R_vec = _mm256_add_pd(R_vec, _mm256_mul_pd(coeff_R_vec, cosB_next));
            B_vec = _mm256_add_pd(B_vec, _mm256_mul_pd(coeff_B_vec, cosB_next));

            cosB_prev = _mm256_splat2_pd(cosB_next);
            cosB_cur  = _mm256_splat3_pd(cosB_next);
        }

        double R = simd::hadd(R_vec);
        double B = simd::hadd(B_vec);
    #endif

    double G = 1.39829 * Y - 0.100913 * B - 0.297375 * R;
    return Color3((Float) R, (Float) G, (Float) B)
        * (2 * math::Pi) * (Float) (coeff0 / Y);
}
Example #29
int simd_chol(double *A, int n){

    register  int i;
    register  int j;
    register  int k;
    register  int local_size = n;
    register __m256d v1, v2, v3, v4, mul1, mul2, sum;

    for (j = 0; j < local_size; j++) {
        for (i = j; i < local_size; i++) {
            register double Aij = A[IDX(i, j, local_size)];
            if (j > 8)
                for (k = 0; k < j;) {
                    if (k < j - 8){

                        v1 = _mm256_loadu_pd(A+IDX(i, k, local_size));
                        v2 = _mm256_loadu_pd(A+IDX(j, k, local_size));
                        mul1 = _mm256_mul_pd(v1, v2);

                        v3 = _mm256_loadu_pd(A+IDX(i, k + 4, local_size));
                        v4 = _mm256_loadu_pd(A+IDX(j, k + 4, local_size));

                        mul2 = _mm256_mul_pd(v3, v4);

                        sum = _mm256_add_pd(mul1, mul2);
                        Aij -= (sum[3] + sum[2] + sum[1] + sum[0]);

                        k = k + 8;
                    } else {
                        Aij -= A[IDX(i, k, local_size)] * A[IDX(j, k, local_size)];
                        k++;
                    }
                }
                // j <= 8: too few columns to benefit from vectorization
            else for (k = 0; k < j; ++k)
                    Aij -= A[IDX(i, k, local_size)] * A[IDX(j, k, local_size)];
            A[IDX(i, j, local_size)] = Aij;
        }

        if (A[IDX(j, j, local_size)] < 0.0) {
            return (1);
        }

        A[IDX(j, j, local_size)] = sqrt(A[IDX(j, j, local_size)]);
        register double Ajj = A[IDX(j, j, local_size)];
        for (i = j + 1; i < local_size;){

            if (i < local_size - 8){
                A[IDX(i, j, local_size)] /= Ajj;
                A[IDX(i + 1, j, local_size)] /= Ajj;
                A[IDX(i + 2, j, local_size)] /= Ajj;
                A[IDX(i + 3, j, local_size)] /= Ajj;
                A[IDX(i + 4, j, local_size)] /= Ajj;
                A[IDX(i + 5, j, local_size)] /= Ajj;
                A[IDX(i + 6, j, local_size)] /= Ajj;
                A[IDX(i + 7, j, local_size)] /= Ajj;
                i += 8;
            } else {
                A[IDX(i, j, local_size)] /= Ajj;
                i++;
            }
        }
    }

    return (0);
}
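The unrolled loops above implement a standard left-looking Cholesky factorization; a compact scalar reference for testing, assuming (as the stride-n accesses suggest) that IDX is row-major indexing:

#include <math.h>

#define IDX(i, j, n) ((i) * (n) + (j))   /* assumption: row-major, as in the code above */

/* Scalar left-looking Cholesky; returns 1 on a negative pivot, like simd_chol */
static int chol_ref(double *A, int n)
{
    for (int j = 0; j < n; j++) {
        for (int i = j; i < n; i++) {
            double Aij = A[IDX(i, j, n)];
            for (int k = 0; k < j; ++k)
                Aij -= A[IDX(i, k, n)] * A[IDX(j, k, n)];
            A[IDX(i, j, n)] = Aij;
        }
        if (A[IDX(j, j, n)] < 0.0) return 1;
        A[IDX(j, j, n)] = sqrt(A[IDX(j, j, n)]);
        for (int i = j + 1; i < n; i++)
            A[IDX(i, j, n)] /= A[IDX(j, j, n)];
    }
    return 0;
}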
Example #30
/*
Naive implementation of Matrix Matrix Multiplication

@param A input matrix
@param B input matrix
@param C output matrix
*/
inline
void	naive(const Matrix& A, const Matrix& B, Matrix& C){
	//preload dimensions for faster access
	int dimM = C.getDimM();
	int dimN = C.getDimN();
	int dimL = A.getDimN();
	
	for (int m = 0; m < dimM; m+=4){				///rows of c
		for (int n = 0; n < dimN; n+=4){			///cols of c	
			//do calculation of a 4x4 block
			//std::cout << m << "\t" << n << std::endl;
			__m256d*	pA = A.get(m, 0);
			__m256d*	pB = A.get(m+1, 0);
			__m256d*	pC = A.get(m+2, 0);
			__m256d*	pD = A.get(m+3, 0);
			__m256d*	pK = B.getT(0, n);
			__m256d*	pL = B.getT(0, n+1);
			__m256d*	pM = B.getT(0, n+2);
			__m256d*	pN = B.getT(0, n+3);
			//std::cout << pA << "\t" << pB << "\t" << pC << "\t" << pD << std::endl;
			__m256d		K = _mm256_setzero_pd();
			__m256d		L = _mm256_setzero_pd();
			__m256d		M = _mm256_setzero_pd();
			__m256d		N = _mm256_setzero_pd();
			__m256d		O = _mm256_setzero_pd();
			__m256d		P = _mm256_setzero_pd();
			__m256d		Q = _mm256_setzero_pd();
			__m256d		R = _mm256_setzero_pd();
			__m256d		S = _mm256_setzero_pd();
			__m256d		T = _mm256_setzero_pd();
			__m256d		U = _mm256_setzero_pd();
			__m256d		V = _mm256_setzero_pd();
			__m256d		W = _mm256_setzero_pd();
			__m256d		X = _mm256_setzero_pd();
			__m256d		Y = _mm256_setzero_pd();
			__m256d		Z = _mm256_setzero_pd();
			for (int l = 0; l < dimL; l+=4){
				//std::cout <<"mul" << std::endl;
				K = K + (*pA) * (*pK);
				L = L + (*pA) * (*pL);
				M = M + (*pA) * (*pM);
				N = N + (*pA) * (*pN);
				O = O + (*pB) * (*pK);
				P = P + (*pB) * (*pL);
				Q = Q + (*pB) * (*pM);
				R = R + (*pB) * (*pN);
				S = S + (*pC) * (*pK);
				T = T + (*pC) * (*pL);
				U = U + (*pC) * (*pM);
				V = V + (*pC) * (*pN);
				W = W + (*pD) * (*pK);
				X = X + (*pD) * (*pL);
				Y = Y + (*pD) * (*pM);
				Z = Z + (*pD) * (*pN);
				//std::cout << "inc" <<std::endl;
				pA++;
				pB++;
				pC++;
				pD++;
				pK++;
				pL++;
				pM++;
				pN++;
			}
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			__m256d sumab = _mm256_hadd_pd(K, L);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			__m256d sumcd = _mm256_hadd_pd(M, N);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			__m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			__m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			__m256d sum =  _mm256_add_pd(perm, blend);

			C.set(m, n, sum);
			//C(m  , n)     = K[0] + K[1] + K[2] + K[3];
			//C(m  , n+1)   = L[0] + L[1] + L[2] + L[3];
			//C(m  , n+2)   = M[0] + M[1] + M[2] + M[3];
			//C(m  , n+3)   = N[0] + N[1] + N[2] + N[3];

			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(O, P);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(Q, R);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+1, n, sum);
			//C(m+1, n  )   = O[0] + O[1] + O[2] + O[3];
			//C(m+1, n+1)   = P[0] + P[1] + P[2] + P[3];
			//C(m+1, n+2)   = Q[0] + Q[1] + Q[2] + Q[3];
			//C(m+1, n+3)   = R[0] + R[1] + R[2] + R[3];
			
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(S, T);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(U, V);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+2, n, sum);
			//C(m+2, n  )   = S[0] + S[1] + S[2] + S[3];
			//C(m+2, n+1)   = T[0] + T[1] + T[2] + T[3];
			//C(m+2, n+2)   = U[0] + U[1] + U[2] + U[3];
			//C(m+2, n+3)   = V[0] + V[1] + V[2] + V[3];
			
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(W, X);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(Y, Z);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+3, n, sum);
			
			//C(m+3, n  )   = W[0] + W[1] + W[2] + W[3];
			//C(m+3, n+1)   = X[0] + X[1] + X[2] + X[3];
			//C(m+3, n+2)   = Y[0] + Y[1] + Y[2] + Y[3];
			//C(m+3, n+3)   = Z[0] + Z[1] + Z[2] + Z[3];
		}
	}
}