Example #1
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly
 */
void
comp_s(st_coords arr, int L)
{
  for(int j=0; j<L; j+=8) {
    int i = (rand() % (L/8)) * 8;
    __m256 x = _mm256_load_ps(&arr.x[i]);
    __m256 y = _mm256_load_ps(&arr.y[i]);
    __m256 z = _mm256_load_ps(&arr.z[i]);
    __m256 t = _mm256_load_ps(&arr.t[i]);
#ifdef FMA
    register __m256 s0;
    s0 = _mm256_mul_ps(x, x);
    s0 = _mm256_fmadd_ps(y, y, s0);
    s0 = _mm256_fmadd_ps(z, z, s0);
    s0 = _mm256_fmsub_ps(t, t, s0);
    s0 = _mm256_sqrt_ps(s0);
#else
    register __m256 s0, s1;
    s1 = _mm256_mul_ps(x, x);
    s0 = _mm256_mul_ps(y, y);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(z, z);
    s1 = _mm256_add_ps(s0, s1);
    s0 = _mm256_mul_ps(t, t);
    s1 = _mm256_sub_ps(s0, s1);
    s0 = _mm256_sqrt_ps(s1);
#endif
    
    _mm256_store_ps(&arr.s[i], s0);
  }  
  return;
}
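
For reference, a scalar version of the same kernel (a sketch only: comp_s_scalar is an illustrative name, and it assumes st_coords exposes float arrays x, y, z, t and s of length L, traversed sequentially):

#include <math.h>

void
comp_s_scalar(st_coords arr, int L)
{
  for (int i = 0; i < L; i++) {
    /* s = sqrt(t^2 - x^2 - y^2 - z^2), one element at a time */
    float s2 = arr.t[i] * arr.t[i] - arr.x[i] * arr.x[i]
             - arr.y[i] * arr.y[i] - arr.z[i] * arr.z[i];
    arr.s[i] = sqrtf(s2);
  }
}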
Example #2
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;

        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;

        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
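
The vector loops above compute, per output element, dst[i] = delta + the sum over k of ky[k] * (src[k][i] - src[-k][i]). A scalar sketch of that recurrence (symm_column_unsymm_scalar is an illustrative name, not part of the original code):

static void symm_column_unsymm_scalar(const float** src, const float* ky,
                                      float* dst, float delta, int width, int ksize2)
{
    for (int i = 0; i < width; i++)
    {
        /* anti-symmetric kernel: row 0 contributes nothing; rows +k and -k
           combine as ky[k] * (src[k][i] - src[-k][i]) */
        float s = delta;
        for (int k = 1; k <= ksize2; k++)
            s += ky[k] * (src[k][i] - src[-k][i]);
        dst[i] = s;
    }
}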
Example #3
    /*!
     * \copydoc avx_vec::fmadd
     */
    ETL_STATIC_INLINE(avx_simd_float) fmadd(avx_simd_float a, avx_simd_float b, avx_simd_float c) {
#ifdef __FMA__
        return _mm256_fmadd_ps(a.value, b.value, c.value);
#else
        return add(mul(a, b), c);
#endif
    }
Example #4
inline void avx2_uv_to_xy_f(const __m256i u, const __m256i v, __m256& x, __m256& y)
{
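  // Scalar equivalent, derived from the intrinsics below: x = u + v*vx, y = v*vy
  // (the inverse of the xy -> uv mapping in avx2_xy_to_uv_f, Example #5).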
  y = _mm256_cvtepi32_ps(v);
  x = _mm256_cvtepi32_ps(u);
  x = _mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
  y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy));
}
Example #5
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v)
{
  // Convert X,Y first into U,V space then round to nearest
  // integer. That gets us close to correct answer, mapping XY to a
  // lozenge-shaped space rather than hexagonal. We then correct the
  // four regions that lie outside the hexagonal cell assigning them
  // to their correct neighboring cell.
  // Writer's note: see ~/Google Drive/Work/calin

  // double dv = y*c_vy_inv;
  // double du = x-dv*c_vx;
  // u = std::lround(du);
  // v = std::lround(dv);
  // du -= u;
  // dv -= v;

  y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
  x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
  u = _mm256_cvtps_epi32(x);
  v = _mm256_cvtps_epi32(y);
  x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
  y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

  // double c3 = dv-du;
  const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

  __m256i uvshift;
  __m256i mask;

  // double c1 = du+0.5*dv;
  // double c2 = dv+0.5*du;
  // if(c3<0) {
  //   if(c1>=1) u++;
  //   else if(c2<-1) v--;
  // } else {
  //   if(c2>=1) v++;
  //   else if(c1<-1) u--;
  // }

  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
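
Assembled from the scalar pseudocode in the comments above, the reference algorithm this vector code follows looks roughly like this (a sketch; c_vx and c_vy_inv stand for the scalar grid constants that the _c_m256_* vectors broadcast, and std::lround comes from <cmath>):

inline void scalar_xy_to_uv_f(float x, float y, int& u, int& v)
{
  float dv = y * c_vy_inv;
  float du = x - dv * c_vx;
  u = std::lround(du);
  v = std::lround(dv);
  du -= u;
  dv -= v;
  float c1 = du + 0.5f * dv;
  float c2 = dv + 0.5f * du;
  float c3 = dv - du;
  if (c3 < 0) {
    if (c1 >= 1)      u++;
    else if (c2 < -1) v--;
  } else {
    if (c2 >= 1)      v++;
    else if (c1 < -1) u--;
  }
}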
Example #6
void polynomial(float *ret, const float *const r_values, int num) {
  // r*r*r*(10+r*(-15+r*6));

  __m256 const_6 = _mm256_set1_ps(6.0f);
  __m256 const_neg_15 = _mm256_set1_ps(-15.0f);
  __m256 const_10 = _mm256_set1_ps(10.0f);
  // constants

  const int loop_factor = 8;

  for (int i = 0; i < num; i+=loop_factor) {

#ifdef USE_IACA
  IACA_START
#endif
    __m256 r;
    __m256 left;
    __m256 right;
    // aligned load of 256 bits r
    r = _mm256_load_ps(&r_values[i]);
    left = _mm256_mul_ps(r, r); // r * r
#ifndef __FMA__

    right = _mm256_mul_ps(r, const_6); // r * 6
    left = _mm256_mul_ps(left, r); // r * r * r
    right = _mm256_add_ps(right, const_neg_15); //-15 + r * 6
    right = _mm256_mul_ps(right, r); //r * (-15 + r * 6)
    right = _mm256_add_ps(right, const_10); //10 + (r * (-15 + r * 6))

#else
    right = _mm256_fmadd_ps(r, const_6, const_neg_15);
    left = _mm256_mul_ps(left, r);

    right = _mm256_fmadd_ps(r, right, const_10);

#endif
    right = _mm256_mul_ps(right, left); // r*r*r *(10 + r * (-15 + r * 6))

    _mm256_store_ps(&ret[i], right); // store 8 values to ret[i]

  }
#ifdef USE_IACA
  IACA_END
#endif
}
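
For comparison, the scalar loop described by the comment at the top of the function (a sketch; polynomial_scalar is an illustrative name):

void polynomial_scalar(float *ret, const float *const r_values, int num) {
  for (int i = 0; i < num; i++) {
    float r = r_values[i];
    ret[i] = r * r * r * (10.0f + r * (-15.0f + r * 6.0f));
  }
}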
Example #7
void MaddMemcpy(float* arg1, float* arg2, float* arg3, int size1, int size2, float* result) {
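    // Note: the aligned loads below require arg1, arg2 and arg3 to be 32-byte
    // aligned, size1/size2 are byte counts passed straight to memcpy, and only
    // the first 8 floats of each buffer take part in the fused multiply-add.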
    memcpy(arg2, arg1, size1);
    memcpy(arg3, arg1, size2);
    __m256 vec1 = _mm256_load_ps(arg1);
    __m256 vec2 = _mm256_load_ps(arg2);
    __m256 vec3 = _mm256_load_ps(arg3);
    __m256 res  = _mm256_fmadd_ps(vec1, vec2, vec3);
    _mm256_storeu_ps(result, res);
}
Example #8
inline void avx2_uv_to_xy_trans_f(const __m256i u, const __m256i v, __m256& x, __m256& y,
  const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0)
{
  // uv_to_xy(u,v,x,y);
  // double xx = x*crot - y*srot;
  // y = scale * (y*crot + x*srot) + dy;
  // x = scale * xx + dx;
  avx2_uv_to_xy_f(u,v,x,y);
  const __m256 vsrot = _mm256_set1_ps(srot);
  const __m256 vcrot = _mm256_set1_ps(crot);
  const __m256 vscale = _mm256_set1_ps(scale);
  __m256 xx = _mm256_mul_ps(y, vsrot);
  xx = _mm256_fmsub_ps(x, vcrot, xx);
  y = _mm256_mul_ps(y, vcrot);
  y = _mm256_fmadd_ps(x, vsrot, y);
  x = xx;
  x = _mm256_fmadd_ps(x, vscale, _mm256_set1_ps(dx));
  y = _mm256_fmadd_ps(y, vscale, _mm256_set1_ps(dy));
}
Example #9
inline __m256i avx2_positive_hexid_to_ringid_root(const __m256i hexid)
{
  // The following algorithm works until hexid=12,589,056
  // const unsigned iarg = 1+4*(hexid-1)/3;
  // return (unsigned(std::sqrt(float(iarg)))+1)/2;
  __m256 arg = _mm256_cvtepi32_ps(hexid);
  arg = _mm256_fmsub_ps(arg, calin::math::simd::c_m256(_c_m256_four_thirds),
    calin::math::simd::c_m256(_c_m256_one_third));
  arg = _mm256_sqrt_ps(arg);
  arg = _mm256_fmadd_ps(arg, calin::math::simd::c_m256(_c_m256_one_half),
    calin::math::simd::c_m256(_c_m256_one_half));
  arg = _mm256_floor_ps(arg);
  return _mm256_cvtps_epi32(arg);
}
Example #10
    __m256 _inner_mm256_exp_ps1(__m256 arg)
    {
        arg = _mm256_mul_ps(arg, _mm256_set1_ps(1.4426950408889634073599246810018921374266459541529859f));

        __m256i e = _mm256_add_epi32(
            _mm256_castps_si256(_mm256_cmp_ps(arg, _mm256_set1_ps(0.0f), _CMP_LT_OQ)),
            _mm256_cvttps_epi32(arg));

        arg = _mm256_sub_ps(arg, _mm256_cvtepi32_ps(e));

        __m256 intermediate_result;
        intermediate_result = _mm256_fmadd_ps(_mm256_set1_ps(0.0136779459179717f), arg, _mm256_set1_ps(0.0517692205767896f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.241554388295527f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.692998430056128f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.999999804292074f));
        arg = intermediate_result;

        __m256 res = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(e, _mm256_set1_epi32(127)), 23));

        res = _mm256_mul_ps(res, arg);

        return res;
    }
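
A minimal driver for the routine above (a hypothetical test harness, assuming _inner_mm256_exp_ps1 is callable as a free function from the same translation unit; it compares the 2^x-based polynomial approximation against expf):

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    float in[8] = { -2.f, -1.f, -0.5f, 0.f, 0.5f, 1.f, 2.f, 3.f };
    float out[8];
    _mm256_storeu_ps(out, _inner_mm256_exp_ps1(_mm256_loadu_ps(in)));
    for (int i = 0; i < 8; i++)
        printf("approx %g  vs  expf %g\n", out[i], expf(in[i]));
    return 0;
}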
Example #11
File: fma.c  Project: pzemtsov/MandelView
void calculate_fma_float (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256 dd = _mm256_set1_ps ((float) scale);
    __m256 XX0 = _mm256_set1_ps ((float) X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256 y0 = _mm256_set1_ps (j*(float) scale + (float) Y0);
        for (unsigned i = 0; i < SX; i += 8)	{
            __m256i ind = _mm256_setr_epi32 (i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
            __m256 x0 = _mm256_fmadd_ps (dd, _mm256_cvtepi32_ps (ind), XX0);
            __m256 x = x0;
            __m256 y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
                __m256 x2 = _mm256_mul_ps (x, x);
                __m256 y2 = _mm256_mul_ps (y, y);
                __m256 abs = _mm256_add_ps (x2, y2);
                __m256i cmp = _mm256_castps_si256 (_mm256_cmp_ps (abs, _mm256_set1_ps (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi32 (counts, cmp_mask);
                __m256 t = _mm256_add_ps (x, x);
                y = _mm256_fmadd_ps (t, y, y0);
                x = _mm256_add_ps (_mm256_sub_ps (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12));
            __m128i result128 = _128i_shuffle (_mm256_extractf128_si256 (result, 0), _mm256_extractf128_si256 (result, 1), 0, 0, 0, 0);
            result128 = _mm_shuffle_epi32 (result128, combine_4_2bits (0, 2, 0, 2));
            _mm_storel_epi64 ((__m128i*) out, result128);
            out += 8;
        }
    }
}
Example #12
inline void avx2_xy_trans_to_uv_with_remainder_f(
  __m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v,
  const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0)
{
  const __m256 vsrot = _mm256_set1_ps(srot);
  const __m256 vcrot = _mm256_set1_ps(crot);
  __m256 vscale = _mm256_set1_ps(1.0f/scale);
  x_in_dx_out = _mm256_mul_ps(_mm256_sub_ps(x_in_dx_out, _mm256_set1_ps(dx)), vscale);
  y_in_dy_out = _mm256_mul_ps(_mm256_sub_ps(y_in_dy_out, _mm256_set1_ps(dy)), vscale);
  __m256 yy = _mm256_mul_ps(x_in_dx_out, vsrot);
  yy = _mm256_fmsub_ps(y_in_dy_out, vcrot, yy);
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vcrot);
  x_in_dx_out = _mm256_fmadd_ps(y_in_dy_out, vsrot, x_in_dx_out);
  avx2_xy_to_uv_with_remainder_f(x_in_dx_out, yy, u, v);

  vscale = _mm256_set1_ps(scale);
  y_in_dy_out = _mm256_mul_ps(yy, vcrot);
  y_in_dy_out = _mm256_fmadd_ps(x_in_dx_out, vsrot, y_in_dy_out);
  y_in_dy_out = _mm256_mul_ps(y_in_dy_out, vscale);

  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vcrot);
  x_in_dx_out = _mm256_fnmadd_ps(yy, vsrot, x_in_dx_out);
  x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vscale);
}
Example #13
__attribute__((noinline)) float dot256fma(float *x1, float *x2, size_t len) {
  assert(len % 8 == 0);
  __m256 sum = _mm256_setzero_ps();
  if (len > 7) {
    size_t limit = len - 7;
    for (size_t i = 0; i < limit; i += 8) {
      __m256 v1 = _mm256_loadu_ps(x1 + i);
      __m256 v2 = _mm256_loadu_ps(x2 + i);
      sum = _mm256_fmadd_ps(v1, v2, sum);
    }
  }
  float buffer[8];
  _mm256_storeu_ps(buffer, sum);
  return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] +
         buffer[6] + buffer[7];
}
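
A scalar reference for the same dot product (a sketch; dot_scalar is an illustrative name, and the AVX version above assumes len is a multiple of 8 and may differ in the last bits because of the different summation order):

float dot_scalar(const float *x1, const float *x2, size_t len) {
  float sum = 0.0f;
  for (size_t i = 0; i < len; i++)
    sum += x1[i] * x2[i];
  return sum;
}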
Example #14
void
check_mm256_fmadd_ps (__m256 __A, __m256 __B, __m256 __C)
{
  union256 a, b, c, e;
  a.x = __A;
  b.x = __B;
  c.x = __C;
  float d[8];
  int i;
  e.x = _mm256_fmadd_ps (__A, __B, __C);
  for (i = 0; i < 8; i++)
    {
      d[i] = a.a[i] * b.a[i] + c.a[i];
    }
  if (check_union256 (e, d))
    abort ();
}
Example #15
inline void avx2_xy_trans_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v,
  const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0)
{
  // x = (x - dx)/scale;
  // y = (y - dy)/scale;
  // double xx = x*crot + y*srot;
  // y = y*crot - x*srot;
  // xy_to_uv(xx,y,u,v);
  const __m256 vsrot = _mm256_set1_ps(srot);
  const __m256 vcrot = _mm256_set1_ps(crot);
  const __m256 vscale = _mm256_set1_ps(1.0f/scale);
  x = _mm256_mul_ps(_mm256_sub_ps(x, _mm256_set1_ps(dx)), vscale);
  y = _mm256_mul_ps(_mm256_sub_ps(y, _mm256_set1_ps(dy)), vscale);
  __m256 yy = _mm256_mul_ps(x, vsrot);
  yy = _mm256_fmsub_ps(y, vcrot, yy);
  x = _mm256_mul_ps(x, vcrot);
  x = _mm256_fmadd_ps(y, vsrot, x);
  avx2_xy_to_uv_f(x, yy, u, v);
}
Example #16
    div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img ...]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
        __m256 t1    = _mm256_mul_ps(lhs.value, ymm0);
        __m256 t2    = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
        __m256 ymm5  = _mm256_addsub_ps(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
        __m256 t3    = _mm256_mul_ps(ymm0, ymm0);
        ymm0         = _mm256_add_ps(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_ps(ymm5, ymm0);
    }
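
The lane shuffles above implement, per complex pair x = a+bi over y = c+di, the usual identity (a sketch of the scalar equivalent; complex_div_scalar is an illustrative name):

// (a + bi) / (c + di) = ((a*c + b*d) + (b*c - a*d)i) / (c*c + d*d)
static inline void complex_div_scalar(float a, float b, float c, float d,
                                      float* re, float* im) {
    float denom = c * c + d * d;
    *re = (a * c + b * d) / denom;
    *im = (b * c - a * d) / denom;
}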
Example #17
int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize)
{
    int i = 0, k;
    for (; i <= width - 8; i += 8)
    {
        const float* src = src0 + i;
        __m256 f, x0;
        __m256 s0 = _mm256_set1_ps(0.0f);
        for (k = 0; k < _ksize; k++, src += cn)
        {
            f = _mm256_set1_ps(_kx[k]);
            x0 = _mm256_loadu_ps(src);
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;
}
Example #18
	template <> __m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c)
	{
		return _mm256_fmadd_ps(a, b, c);
	}
Example #19
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
               size_t bstep, float* cptr, size_t cstep,
               int ma, int na, int nb )
{
    int n = 0;
    for( ; n <= nb - 16; n += 16 )
    {
        for( int m = 0; m < ma; m += 4 )
        {
            const float* aptr0 = aptr + astep*m;
            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);

            float* cptr0 = cptr + cstep*m;
            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);

            __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
            __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
            __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
            __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();

            for( int k = 0; k < na; k++ )
            {
                __m256 a0 = _mm256_set1_ps(aptr0[k]);
                __m256 a1 = _mm256_set1_ps(aptr1[k]);
                __m256 a2 = _mm256_set1_ps(aptr2[k]);
                __m256 a3 = _mm256_set1_ps(aptr3[k]);
                __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n);
                __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8);
                d00 = _mm256_fmadd_ps(a0, b0, d00);
                d01 = _mm256_fmadd_ps(a0, b1, d01);
                d10 = _mm256_fmadd_ps(a1, b0, d10);
                d11 = _mm256_fmadd_ps(a1, b1, d11);
                d20 = _mm256_fmadd_ps(a2, b0, d20);
                d21 = _mm256_fmadd_ps(a2, b1, d21);
                d30 = _mm256_fmadd_ps(a3, b0, d30);
                d31 = _mm256_fmadd_ps(a3, b1, d31);
            }

            _mm256_storeu_ps(cptr0 + n, d00);
            _mm256_storeu_ps(cptr0 + n + 8, d01);
            _mm256_storeu_ps(cptr1 + n, d10);
            _mm256_storeu_ps(cptr1 + n + 8, d11);
            _mm256_storeu_ps(cptr2 + n, d20);
            _mm256_storeu_ps(cptr2 + n + 8, d21);
            _mm256_storeu_ps(cptr3 + n, d30);
            _mm256_storeu_ps(cptr3 + n + 8, d31);
        }
    }

    for( ; n < nb; n++ )
    {
        for( int m = 0; m < ma; m++ )
        {
            const float* aptr0 = aptr + astep*m;
            float* cptr0 = cptr + cstep*m;
            float d0 = 0.f;

            for( int k = 0; k < na; k++ )
                d0 += aptr0[k]*bptr[k*bstep + n];

            cptr0[n] = d0;
        }
    }
    _mm256_zeroupper();
}
Example #20
__m256 test_mm256_fmadd_ps(__m256 a, __m256 b, __m256 c) {
  // CHECK-LABEL: test_mm256_fmadd_ps
  // CHECK: @llvm.x86.fma.vfmadd.ps.256
  return _mm256_fmadd_ps(a, b, c);
}
Example #21
void molec_quadrant_neighbor_interaction_fma(molec_Quadrant_t q, molec_Quadrant_t q_n, float* Epot_)
{
#ifdef __AVX2__
    const __m256 sigLJ = _mm256_set1_ps(molec_parameter->sigLJ);
    const __m256 epsLJ = _mm256_set1_ps(molec_parameter->epsLJ);

    const __m256 Rcut2 = _mm256_set1_ps(molec_parameter->Rcut2);

    const int N = q.N;
    const int N_n = q_n.N_pad;

    __m256 Epot8 = _mm256_setzero_ps();
    __m256 _1 = _mm256_set1_ps(1.f);
    __m256 _2 = _mm256_set1_ps(2.f);
    __m256 _24epsLJ = _mm256_mul_ps(_mm256_set1_ps(24.f), epsLJ);

    for(int i = 0; i < N; ++i)
    {
        const __m256 xi = _mm256_set1_ps(q.x[i]);
        const __m256 yi = _mm256_set1_ps(q.y[i]);
        const __m256 zi = _mm256_set1_ps(q.z[i]);

        __m256 f_xi = _mm256_setzero_ps();
        __m256 f_yi = _mm256_setzero_ps();
        __m256 f_zi = _mm256_setzero_ps();

        for(int j = 0; j < N_n; j += 8)
        {
            // count number of interactions
            if(MOLEC_CELLLIST_COUNT_INTERACTION)
                ++num_potential_interactions;

            // load coordinates and forces into AVX vectors
            const __m256 xj = _mm256_load_ps(&q_n.x[j]);
            const __m256 yj = _mm256_load_ps(&q_n.y[j]);
            const __m256 zj = _mm256_load_ps(&q_n.z[j]);

            __m256 f_xj = _mm256_load_ps(&q_n.f_x[j]);
            __m256 f_yj = _mm256_load_ps(&q_n.f_y[j]);
            __m256 f_zj = _mm256_load_ps(&q_n.f_z[j]);


            // distance computation
            const __m256 xij = _mm256_sub_ps(xi, xj);
            const __m256 yij = _mm256_sub_ps(yi, yj);
            const __m256 zij = _mm256_sub_ps(zi, zj);


            const __m256 zij2 = _mm256_mul_ps(zij, zij);
            const __m256 r2 = _mm256_fmadd_ps(xij, xij, _mm256_fmadd_ps(yij, yij, zij2));


            // r2 < Rcut2
            const __m256 mask = _mm256_cmp_ps(r2, Rcut2, _CMP_LT_OQ);

            // if( any(r2 < R2) )
            if(_mm256_movemask_ps(mask))
            {
                const __m256 r2inv = _mm256_div_ps(_1, r2);

                const __m256 s2 = _mm256_mul_ps(_mm256_mul_ps(sigLJ, sigLJ), r2inv);
                const __m256 s6 = _mm256_mul_ps(_mm256_mul_ps(s2, s2), s2);
                const __m256 s12 = _mm256_mul_ps(s6, s6);

                const __m256 s12_minus_s6 = _mm256_sub_ps(s12, s6);
                const __m256 two_s12_minus_s6 = _mm256_sub_ps(_mm256_mul_ps(_2, s12), s6);

                Epot8 = _mm256_add_ps(Epot8, _mm256_and_ps(s12_minus_s6, mask));

                const __m256 fr = _mm256_mul_ps(_mm256_mul_ps(_24epsLJ, r2inv), two_s12_minus_s6);
                const __m256 fr_mask = _mm256_and_ps(fr, mask);


                // update forces
                f_xi = _mm256_fmadd_ps(fr_mask, xij,f_xi);
                f_yi = _mm256_fmadd_ps(fr_mask, yij,f_yi);
                f_zi = _mm256_fmadd_ps(fr_mask, zij,f_zi);

                f_xj = _mm256_fnmadd_ps(fr_mask,xij,f_xj);
                f_yj = _mm256_fnmadd_ps(fr_mask,yij,f_yj);
                f_zj = _mm256_fnmadd_ps(fr_mask,zij,f_zj);

                // store back j-forces
                _mm256_store_ps(&q_n.f_x[j], f_xj);
                _mm256_store_ps(&q_n.f_y[j], f_yj);
                _mm256_store_ps(&q_n.f_z[j], f_zj);
            }
        }

        // update i-forces
        float MOLEC_ALIGNAS(32) f_array[8];
        _mm256_store_ps(f_array, f_xi);
        q.f_x[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3] + f_array[4] + f_array[5]
                    + f_array[6] + f_array[7];
        _mm256_store_ps(f_array, f_yi);
        q.f_y[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3] + f_array[4] + f_array[5]
                    + f_array[6] + f_array[7];
        _mm256_store_ps(f_array, f_zi);
        q.f_z[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3] + f_array[4] + f_array[5]
                    + f_array[6] + f_array[7];
    }

    float MOLEC_ALIGNAS(32) E_pot_array[8];
    _mm256_store_ps(E_pot_array, Epot8);

    // perform reduction of potential energy
    *Epot_ += 4
              * molec_parameter->epsLJ*(E_pot_array[0] + E_pot_array[1] + E_pot_array[2]
                                        + E_pot_array[3] + E_pot_array[4] + E_pot_array[5]
                                        + E_pot_array[6] + E_pot_array[7]);
#endif
}
Example #22
float tricub_x86_f(float *src, float *abcd, float x, float y){
  float *s;
  float x0, x1, x2, x3, y0, y1, y2, y3;
  float dst[4];
#if defined(__AVX2__) && defined(__x86_64__)
  __m256 v1, v2, v3, v4;
  __m256 va, vb, vc, vd;
  __m128 va4, vb4, vc4, vd4;
  __m128 v128a, v128b;
  __m128 vy0, vy1, vy2, vy3;
#else
  int i, ni2, ni3, ninj2, ninj3;
  float va4[4], vb4[4], vc4[4], vd4[4];
  ninj2 = ninj + ninj;
  ninj3 = ninj2 + ninj;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
#endif

#if defined(__AVX2__) && defined(__x86_64__)

// ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ====

  va = _mm256_broadcast_ss(abcd);   // promote constants to vectors
  vb = _mm256_broadcast_ss(abcd+1);
  vc = _mm256_broadcast_ss(abcd+2);
  vd = _mm256_broadcast_ss(abcd+3);

  s = src;                          // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 0
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 1
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 0
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 1
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 0
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 1
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 0
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 1
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low)
  vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high)

  s = src + 2*ni;                   // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 2
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 3
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 2
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 3
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 2
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 3
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 2
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 3
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2  (v1 low)
  vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3  (v1 high)

// ==== interpolation along Y, vector length is 4 (4 rows) ====

  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);

  va4 = _mm_broadcast_ss(&y0);      // promote constants to vectors
  vb4 = _mm_broadcast_ss(&y1);
  vc4 = _mm_broadcast_ss(&y2);
  vd4 = _mm_broadcast_ss(&y3);

  vy0 = _mm_mul_ps(vy0,va4);        //    vy0 * va4
  vy0 = _mm_fmadd_ps(vy1,vb4,vy0);  // += vy1 * vb4
  vy0 = _mm_fmadd_ps(vy2,vc4,vy0);  // += vy2 * vc4
  vy0 = _mm_fmadd_ps(vy3,vd4,vy0);  // += vy3 * vd4
  
  _mm_storeu_ps(dst,vy0);           // store 4 values along X
#else
  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);
  for (i=0 ; i<4 ; i++){
    va4[i] = src[i    ]*abcd[0] + src[i    +ninj]*abcd[1] +  src[i    +ninj2]*abcd[2] + src[i    +ninj3]*abcd[3];
    vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] +  src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3];
    vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] +  src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3];
    vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] +  src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3];
    dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3;
  }
#endif

// ==== interpolation along x, scalar ====

  x0 = cm167*x*(x-one)*(x-two);
  x1 = cp5*(x+one)*(x-one)*(x-two);
  x2 = cm5*x*(x+one)*(x-two);
  x3 = cp167*x*(x+one)*(x-one);

  return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3);
}
Example #23
void
calc_dnn_fma(float *dst, float *src, float *w, float *b, int out, int in, float *fstore)
{
#ifdef HAS_SIMD_FMA

  float *s;
  int i, j;
  int n = in / 8;

  for (i = 0; i + 3 < out; i += 4) {
    float *w2, *w3, *w4;
    __m256 x1 = _mm256_setzero_ps();
    __m256 x2 = _mm256_setzero_ps();
    __m256 x3 = _mm256_setzero_ps();
    __m256 x4 = _mm256_setzero_ps();
    w2 = w + in;
    w3 = w2 + in;
    w4 = w3 + in;
    s = src;
    for (j = 0; j < n; j++) {
      __m256 vs = _mm256_load_ps(s);
      __m256 vw1 = _mm256_load_ps(w);
      __m256 vw2 = _mm256_load_ps(w2);
      __m256 vw3 = _mm256_load_ps(w3);
      __m256 vw4 = _mm256_load_ps(w4);
      x1 = _mm256_fmadd_ps(vs, vw1, x1);
      x2 = _mm256_fmadd_ps(vs, vw2, x2);
      x3 = _mm256_fmadd_ps(vs, vw3, x3);
      x4 = _mm256_fmadd_ps(vs, vw4, x4);
      s  += 8;
      w  += 8;
      w2 += 8;
      w3 += 8;
      w4 += 8;
    }
    _mm256_store_ps(fstore, x1);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    _mm256_store_ps(fstore, x2);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    _mm256_store_ps(fstore, x3);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    _mm256_store_ps(fstore, x4);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    w = w4;
  }

  /* process last <4 nodes */
  for (; i < out; i++) {
    __m256 x = _mm256_setzero_ps();
    s = src;
    for (j = 0; j < n; j++) {
      __m256 vs = _mm256_load_ps(s);
      __m256 v = _mm256_load_ps(w);
      x = _mm256_fmadd_ps(vs, v, x);
      s  += 8;
      w  += 8;
    }
    _mm256_store_ps(fstore, x);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
  }
  
#endif	/* HAS_SIMD_FMA */
}
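
A scalar sketch of what the unrolled loops compute (calc_dnn_scalar is an illustrative name; it assumes w holds out rows of in weights back to back, which is the layout implied by w2 = w + in above):

void
calc_dnn_scalar(float *dst, float *src, float *w, float *b, int out, int in)
{
  for (int i = 0; i < out; i++) {
    float acc = 0.0f;
    for (int j = 0; j < in; j++)
      acc += src[j] * w[i * in + j];
    dst[i] = acc + b[i];
  }
}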
Example #24
File: main.cpp  Project: CCJY/coliru
matrix_t matrixMulFMA(const matrix_t & matrixA, const matrix_t & matrixB) {
	auto dimension = matrixA.size();

	assert(matrixA.size() == dimension);
	assert(matrixA[0].size() == dimension);
	assert(matrixB.size() == dimension);
	assert(matrixB[0].size() == dimension);

	matrix_t matrixC(dimension, typename matrix_t::value_type(dimension, 0));//0ed matrix

	const int vec = 8;
	int start{0};

	if(dimension > vec) {
		start = dimension - dimension % vec;
		for(int x{0}; x < dimension; ++x)
		for(int i{0}; i < dimension; ++i) {
			const __m256 a = _mm256_set1_ps(matrixA[x][i]);// broadcast matrixA[x][i] into all 8 lanes
			for(int y{0}; y < start; y += vec) {// full 8-wide blocks; columns [start, dimension) are handled by the scalar tail below
				//__m256 c = _mm256_set_ps(matrixC[x][y+7], matrixC[x][y+6], matrixC[x][y+5], matrixC[x][y+4], matrixC[x][y+3], matrixC[x][y+2], matrixC[x][y+1], matrixC[x][y+0]);
				//const __m256 b = _mm256_set_ps(matrixB[i][y+7], matrixB[i][y+6], matrixB[i][y+5], matrixB[i][y+4], matrixB[i][y+3], matrixB[i][y+2], matrixB[i][y+1], matrixB[i][y+0]);

				__m256 c = *reinterpret_cast<__m256*>(&matrixC[x][y]);// aligned read (assumes the row storage is 32-byte aligned)
				const __m256 & b = *reinterpret_cast<const __m256*>(&matrixB[i][y]);// aligned read (same 32-byte alignment assumption)

				c = _mm256_fmadd_ps(a, b, c);//c = a * b + c;
				//_mm256_store_ps(&matrixC[x][y], c);//aligned
				//_mm256_storeu_ps(&matrixC[x][y], c);//unaligned

				/*
				float c[8];
				c[0] = matrixC[i][y+0];
				c[1] = matrixC[i][y+1];
				c[2] = matrixC[i][y+2];
				c[3] = matrixC[i][y+3];
				c[4] = matrixC[i][y+4];
				c[5] = matrixC[i][y+5];
				c[6] = matrixC[i][y+6];
				c[7] = matrixC[i][y+7];

				c[0] += matrixA[x][i] * matrixB[i][y+0];
				c[1] += matrixA[x][i] * matrixB[i][y+1];
				c[2] += matrixA[x][i] * matrixB[i][y+2];
				c[3] += matrixA[x][i] * matrixB[i][y+3];
				c[4] += matrixA[x][i] * matrixB[i][y+4];
				c[5] += matrixA[x][i] * matrixB[i][y+5];
				c[6] += matrixA[x][i] * matrixB[i][y+6];
				c[7] += matrixA[x][i] * matrixB[i][y+7];
				//*/
				//*
				matrixC[x][y+0] = c[0];
				matrixC[x][y+1] = c[1];
				matrixC[x][y+2] = c[2];
				matrixC[x][y+3] = c[3];
				matrixC[x][y+4] = c[4];
				matrixC[x][y+5] = c[5];
				matrixC[x][y+6] = c[6];
				matrixC[x][y+7] = c[7];
				//*/

				/*is doing this
				matrixC[x][y+0] += matrixA[x][i] * matrixB[i][y+0];
				matrixC[x][y+1] += matrixA[x][i] * matrixB[i][y+1];
				matrixC[x][y+2] += matrixA[x][i] * matrixB[i][y+2];
				matrixC[x][y+3] += matrixA[x][i] * matrixB[i][y+3];
				matrixC[x][y+4] += matrixA[x][i] * matrixB[i][y+4];
				matrixC[x][y+5] += matrixA[x][i] * matrixB[i][y+5];
				matrixC[x][y+6] += matrixA[x][i] * matrixB[i][y+6];
				matrixC[x][y+7] += matrixA[x][i] * matrixB[i][y+7];
				//*/
			}
		}
	}

	//calculate remaining columns
	for(int x{0}; x < dimension; ++x)
	for(int i{0}; i < dimension; ++i)
	for(int y{start}; y < dimension; ++y)
		matrixC[x][y] += matrixA[x][i] * matrixB[i][y];

	return matrixC;//move semantics ftw
}
Example #25
File: fma-compile.c  Project: 0day-ci/gcc
__m256
check_mm256_fmadd_ps (__m256 a, __m256 b, __m256 c)
{
  return _mm256_fmadd_ps (a, b, c);
}