Example #1
float vec3::dot(const vec3 &b) const {
    // make sure w component is 0
    __m128 temp = _mm_mul_ps(v, b.v);                 // (x1*x2, y1*y2, z1*z2, 0)
    __m128 temp2 = _mm_shuffle_ps(temp, temp, 0xFE);  // lane 0 = z1*z2
    temp2 = _mm_add_ps(temp, temp2);                  // lane 0 = x1*x2 + z1*z2
    return _mm_cvtss_f32(_mm_add_ps(temp2, _mm_shuffle_ps(temp, temp, 0xFD))); // + y1*y2
}
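On SSE4.1 hardware the same reduction collapses to a single dot-product instruction. A minimal sketch (the method name dot_sse41 is ours, assuming v is the __m128 member holding x, y, z, w):

float vec3::dot_sse41(const vec3 &b) const {
    // 0x71: multiply lanes 0-2 (x, y, z), write the sum to lane 0 only,
    // so the w lanes never enter the result
    return _mm_cvtss_f32(_mm_dp_ps(v, b.v, 0x71));
}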
Example #2
float vec3::length() const {
    __m128 temp = _mm_mul_ps(v, v);                   // (x*x, y*y, z*z, 0)
    __m128 temp2 = _mm_shuffle_ps(temp, temp, 0xFD);  // lane 0 = y*y
    temp2 = _mm_add_ps(temp, temp2);                  // lane 0 = x*x + y*y
    temp2 = _mm_add_ps(temp2, _mm_shuffle_ps(temp, temp, 0xFE)); // lane 0 += z*z
    return _mm_cvtss_f32(_mm_sqrt_ps(temp2));         // only lane 0 is consumed; _mm_sqrt_ss would also do
}
Example #3
float Float4::Dot(float x1, float y1, float z1, float w1, float x2, float y2, float z2, float w2) {
	// Task 8: replace with SSE
	__m128 v1 = _mm_setr_ps(x1, y1, z1, w1);
	__m128 v2 = _mm_setr_ps(x2, y2, z2, w2);
	// 0xF1: high nibble = multiply all four lanes, low nibble = write the sum into lane 0 only
	__m128 dpResult = _mm_dp_ps(v1, v2, 0xF1);
	return _mm_cvtss_f32(dpResult);
	//return x1*x2 + y1*y2 + z1*z2 + w1*w2;
}
Example #4
int dihedral(const float* xyz, const int* quartets, float* out,
          const int n_frames, const int n_atoms, const int n_quartets) {
  /* Compute the angle between sets of four atoms in every frame
     of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     quartets : array, shape=(n_quartets, 4)
         The specific quartets of atoms whose angles you want to compute. The
         angle computed will be the torsion around the bond between the
         middle two elements (i.e. the A-B-C-D dihedral). A 2d array of indices, in C order.
     out : array, shape=(n_frames, n_quartets)
         Array where the angles will be stored, in contiguous C order.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not.
  */

  int i, j;
  __m128 x0, x1, x2, x3, b1, b2, b3, c1, c2, p1, p2;

  for (i = 0; i < n_frames; i++) {
    for (j = 0; j < n_quartets; j++) {
      x0 = load_float3(xyz + 3*quartets[4*j + 0]);
      x1 = load_float3(xyz + 3*quartets[4*j + 1]);
      x2 = load_float3(xyz + 3*quartets[4*j + 2]);
      x3 = load_float3(xyz + 3*quartets[4*j + 3]);

      b1 = _mm_sub_ps(x1, x0);
      b2 = _mm_sub_ps(x2, x1);
      b3 = _mm_sub_ps(x3, x2);

      c1 = cross(b2, b3);
      c2 = cross(b1, b2);

      p1 = _mm_mul_ps(_mm_dp_ps(b1, c1, 0x71), _mm_sqrt_ps(_mm_dp_ps(b2, b2, 0x71)));
      p2 = _mm_dp_ps(c1, c2, 0x71);

      *(out++) = atan2(_mm_cvtss_f32(p1), _mm_cvtss_f32(p2));
    }
    xyz += n_atoms*3;
  }
  return 1;
}
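load_float3 and cross are helpers assumed by this example rather than standard intrinsics. A plausible sketch of both, under the assumption that load_float3 fills lanes 0-2 and zeroes lane 3 (so the cross product also keeps w = 0):

static inline __m128 load_float3(const float* p) {
    // build (p[0], p[1], p[2], 0) without reading past the three floats
    __m128 x = _mm_load_ss(p);
    __m128 y = _mm_load_ss(p + 1);
    __m128 z = _mm_load_ss(p + 2);
    __m128 xy = _mm_unpacklo_ps(x, y);   // (p[0], p[1], 0, 0)
    return _mm_movelh_ps(xy, z);         // (p[0], p[1], p[2], 0)
}

static inline __m128 cross(__m128 a, __m128 b) {
    // compute a * b.yzx - a.yzx * b, then rotate the result back
    // (the classic shuffle-based cross product)
    __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
    return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}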
Example #5
inline float Sqrt(const float &sqr) // #include <xmmintrin.h>
{
	__m128 mm1;

	mm1 = _mm_set_ss(sqr);
	mm1 = _mm_sqrt_ss(mm1);

	return _mm_cvtss_f32(mm1);
}
Example #6
static void 
mexsoftmax(float* y, float* shift, mwSize m, mwSize n) {
  __m128 i1, i2;
  __m128 o1, o2;
 
  while (m>0)
    {
      mwSize curn = n;
      float sum = 0.0f;
      declconst128(zero, 0.0f);
      
      while (curn>0 && ((unsigned long)(y+curn) & 15) != 0)
        {
          --curn;
          y[curn]=fastexp(y[curn]-*shift);
          sum += y[curn];
        }

      __m128 s1 = _mm_load1_ps (shift);
      __m128 sum1 = zero;

      while (curn>7) {
        i1 = _mm_load_ps (y+curn-4);
        i2 = _mm_load_ps (y+curn-8);
        i1 = _mm_sub_ps (i1, s1);
        i2 = _mm_sub_ps (i2, s1);
        o1 = vfastexp(i1);
        o2 = vfastexp(i2);
        _mm_store_ps (y+curn-4, o1);
        sum1 = _mm_add_ps (sum1, o1);
        _mm_store_ps (y+curn-8, o2);
        sum1 = _mm_add_ps (sum1, o2);
        curn-=8;
      }

      sum1 = _mm_hadd_ps (sum1, sum1);
      sum1 = _mm_hadd_ps (sum1, sum1);
      sum += _mm_cvtss_f32 (sum1);
     
      while(curn>0) {
        --curn;
        y[curn]=fastexp(y[curn]-*shift);
        sum += y[curn];
      }

      sum = 1.0f / sum;

      ptrdiff_t n_pdt = n;
      ptrdiff_t one_pdt = 1;

      sscal (&n_pdt, &sum, y, &one_pdt);

      ++shift;
      y+=n;
      --m;
    }
}
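declconst128, fastexp, vfastexp, and sscal come from the surrounding project, not from the intrinsics headers. A plausible reading of the macro, assuming it simply declares a named __m128 constant with all four lanes set:

#define declconst128(name, value) const __m128 name = _mm_set1_ps(value)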
Example #7
float Vertex::length() const
{
#ifdef SSE4
	__m128 ans = _mm_dp_ps(dat, dat, 0b01110001);
	return _mm_cvtss_f32(_mm_sqrt_ss(ans));
#else
	return sqrt(x*x + y*y + z*z);
#endif
}
Example #8
float Vertex::length_sqr() const
{
#ifdef SSE4
	__m128 ans = _mm_dp_ps(dat, dat, 0b01110001);
	return _mm_cvtss_f32(ans);
#else
	return x*x + y*y + z*z;
#endif
}
Example #9
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{ 
#ifdef AVX
	//Black magic
	//But it does produce the same results as the non-AVX code
	__m256 accumulator;
	__m256 x_s = _mm256_load_ps(x->Features);
	__m256 y_s = _mm256_load_ps(y->Features);
	__m256 result = _mm256_sub_ps(x_s, y_s);
	accumulator = _mm256_mul_ps(result, result);

	x_s = _mm256_load_ps(&x->Features[8]);
	y_s = _mm256_load_ps(&y->Features[8]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[16]);
	y_s = _mm256_load_ps(&y->Features[16]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[24]);
	y_s = _mm256_load_ps(&y->Features[24]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);
	//We now have a vector of 8 floats

	__m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
	__m256 t2 = _mm256_hadd_ps(t1, t1);
	__m128 t3 = _mm256_extractf128_ps(t2, 1);
	__m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
	//And now we don't
	return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
	//Can be autovectorized
	float accumulator[32];
	float distance = 0;
	for (int i = 0; i < 30; i++)
	{
		accumulator[i] = x->Features[i] - y->Features[i];
	}

	//If done properly this should be 4(8) instructions
	for (int i = 0; i < 30; i++)
	{
		distance += accumulator[i] * accumulator[i];
	}

	return std::sqrtf(distance);
#endif

	
}
Example #10
float Vertex::operator&(const Vertex &v) const
{
#ifdef SSE4
	__m128 ans = _mm_dp_ps(dat, v.dat, 0b01110001);
	return _mm_cvtss_f32(ans);
#else
	return x*v.x + y*v.y + z*v.z;
#endif
}
Example #11
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h>
{
	const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f);
	const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f);
	const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f);

	const __m128 _ps_pi    = _mm_set1_ps(pi);
	const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5);

	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

	__m128 mm1, mm2, mm3;
	__m128 axm, aym;

	__m128 xm = _mm_set1_ps(x);
	__m128 ym = _mm_set1_ps(y);

	axm = _mm_and_ps(xm, _mask_sign_inv);
	aym = _mm_and_ps(ym, _mask_sign_inv);

	mm1 = _mm_min_ps(axm, aym);
	mm2 = _mm_max_ps(axm, aym);
	mm1 = _mm_div_ps(mm1, mm2);
	mm2 = _mm_mul_ps(mm1, mm1);

	mm3 = _mm_mul_ps(mm2, _ps_atan_p0);
	mm3 = _mm_add_ps(mm3, _ps_atan_p1);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_sub_ps(mm3, _ps_atan_p2);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_mul_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	__m128 mask;

	/* |y| > |x| */
	mask = _mm_cmpgt_ss(aym, axm);
	mm2 = _mm_and_ps(_ps_pi0p5, mask);
	mm1 = _mm_and_ps(_mask_sign_raw, mask);
	mm3 = _mm_xor_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm2);

	/* x < 0 */
	mask = _mm_and_ps(xm, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mask);
	mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30));
	mm1 = _mm_and_ps(_ps_pi, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	/* y < 0 */
	mm1 = _mm_and_ps(ym, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mm1);

	return _mm_cvtss_f32(mm3);
}
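pi and pi0p5 are constants assumed from the surrounding file; presumably π and π/2:

static const float pi    = 3.14159265358979f; /* pi   */
static const float pi0p5 = 1.57079632679490f; /* pi/2 */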
Example #12
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

    #define _float_cords_dot_prod(curr, next, index)                    \
        _mm256_dp_ps(                                                   \
            curr,                                                       \
            _mm256_xor_ps(                                              \
                _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
                _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)  \
            ),                                                          \
            0b11110000 | (1 << (index))                                 \
        )


    unsigned long index;
    for (index = 0; index + 16 < cords_len; index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...
    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
Example #13
static double rcp_d(double x)
{
    __m128d xd = _mm_load_sd(&x);
    double xi = _mm_cvtss_f32(_mm_rcp_ss(_mm_cvtsd_ss(_mm_setzero_ps(), xd)));

    xi = xi + xi * (1.0 - x * xi);
    xi = xi + xi * (1.0 - x * xi);

    return xi;
}
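The two updates are Newton-Raphson refinement steps for the reciprocal: each xi = xi + xi*(1.0 - x*xi), i.e. xi = xi*(2 - x*xi), roughly doubles the number of correct bits, so the ~12-bit estimate from _mm_rcp_ss reaches roughly 48 bits after two iterations (useful, but still short of full double precision).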
Example #14
    inline float hadd(const vector4f& rhs)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        __m128 tmp0 = _mm_hadd_ps(rhs, rhs);
        __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
#else
        __m128 tmp0 = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs));
        __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
#endif
        return _mm_cvtss_f32(tmp1);
    }
Example #15
    _XOINL float QuaternionSquareSum(const Quaternion& q)
    {
#if defined(XO_SSE)
        __m128 square = _mm_mul_ps(q.xmm, q.xmm);
        square = _mm_hadd_ps(square, square);
        square = _mm_hadd_ps(square, square);
        return _mm_cvtss_f32(square);
#else
        return q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w;
#endif
    }
Example #16
//Thanks stack overflow.
static inline float _mm256_reduce_add_ps(__m256 x) {
    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const int imm = 1;
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, imm), _mm256_castps256_ps128(x));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    /* Conversion to float is a no-op on x86-64 */
    return _mm_cvtss_f32(x32);
}
Example #17
    inline
    float operator[](int i) const
    {
        __m512 buf0;
        if (i < 16) {
            buf0 = val1;
        } else {
            buf0 = val2;
        }

        i &= 15;

        __m128 buf1;
        if (i < 8) {
            if (i < 4) {
                buf1 =  _mm512_extractf32x4_ps(buf0, 0);
            } else {
                buf1 =  _mm512_extractf32x4_ps(buf0, 1);
            }
        } else {
            if (i < 12)  {
                buf1 =  _mm512_extractf32x4_ps(buf0, 2);
            } else {
                buf1 =  _mm512_extractf32x4_ps(buf0, 3);
            }
        }

        i &= 3;

        if (i == 3) {
            return _mm_cvtss_f32(_mm_shuffle_ps(buf1, buf1, 3));
        }
        if (i == 2) {
            return _mm_cvtss_f32(_mm_shuffle_ps(buf1, buf1, 2));
        }
        if (i == 1) {
            return _mm_cvtss_f32(_mm_shuffle_ps(buf1, buf1, 1));
        }

        return _mm_cvtss_f32(buf1);
    }
Example #18
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
static inline float horizontal_sum_avx2(__m256 x)
{
  const __m128 hi_quad = _mm256_extractf128_ps(x, 1);
  const __m128 lo_quad = _mm256_castps256_ps128(x);
  const __m128 sum_quad = _mm_add_ps(lo_quad, hi_quad);
  const __m128 lo_dual = sum_quad;
  const __m128 hi_dual = _mm_movehl_ps(sum_quad, sum_quad);
  const __m128 sum_dual = _mm_add_ps(lo_dual, hi_dual);
  const __m128 lo = sum_dual;
  const __m128 hi = _mm_shuffle_ps(sum_dual, sum_dual, 0x1);
  const __m128 sum = _mm_add_ss(lo, hi);
  return _mm_cvtss_f32(sum);
}
Example #19
int angle(const float* xyz, const int* triplets, float* out,
          const int n_frames, const int n_atoms, const int n_angles) {
  /* Compute the angle between triples of atoms in every frame
     of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     triplets : array, shape=(n_angles, 3)
         The specific triple of atoms whose angle you want to compute. The
         angle computed will be centered on the middle element (i.e. the
         A-B-C angle). A 2d array of indices, in C order.
     out : array, shape=(n_frames, n_angles)
         Array where the angles will be stored, in contiguous C order.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not.
  */

  int i, j;
  __m128 r_m, r_n, r_o, u_prime, u, v_prime, v;

  for (i = 0; i < n_frames; i++) {
    for (j = 0; j < n_angles; j++) {
      r_m = load_float3(xyz + 3*triplets[3*j + 0]);
      r_o = load_float3(xyz + 3*triplets[3*j + 1]);
      r_n = load_float3(xyz + 3*triplets[3*j + 2]);

      u_prime = _mm_sub_ps(r_m, r_o);
      v_prime = _mm_sub_ps(r_n, r_o);

      // normalize the vectors u_prime and v_prime
      u = _mm_mul_ps(u_prime, _mm_rsqrt_ps(_mm_dp_ps(u_prime, u_prime, 0x7F)));
      v = _mm_mul_ps(v_prime, _mm_rsqrt_ps(_mm_dp_ps(v_prime, v_prime, 0x7F)));

      // compute the arccos of the dot product, and store the result.
      *(out++) = acos(_mm_cvtss_f32(_mm_dp_ps(u, v, 0x71)));
    }
    // advance to the next frame
    xyz += n_atoms*3;
  }

  return 1;
}
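Note that _mm_rsqrt_ps returns only a ~12-bit approximation, which propagates into the arccos. Where more accuracy is needed, one Newton-Raphson step can be applied to the reciprocal square root first; a sketch (the helper name rsqrt_nr is ours):

static inline __m128 rsqrt_nr(__m128 d2) {
    // r <- r * (1.5 - 0.5 * d2 * r * r): one NR step for 1/sqrt(d2)
    __m128 r = _mm_rsqrt_ps(d2);
    __m128 half_d2 = _mm_mul_ps(_mm_set1_ps(0.5f), d2);
    return _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.5f),
                                    _mm_mul_ps(half_d2, _mm_mul_ps(r, r))));
}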
Example #20
    float DotProductSIMD(const float* a, const float* b, std::size_t n) {
        std::size_t i = 0;

        __m128 sum =  _mm_setzero_ps();
        for(; i < ROUND_DOWN(n, 4); i += 4) {
            __m128 x = _mm_loadu_ps(a + i);
            __m128 y = _mm_loadu_ps(b + i);
            x = _mm_mul_ps(x, y);
            sum = _mm_add_ps(x, sum);
        }

        sum = _mm_hadd_ps(sum, sum);
        sum = _mm_hadd_ps(sum, sum);
        float product = _mm_cvtss_f32(sum);

        for(; i < n; i++) {
            product += a[i] * b[i];
        }
        return product;
    }
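ROUND_DOWN is assumed from the surrounding code; a typical definition for a power-of-two step would be:

#define ROUND_DOWN(x, s) ((x) & ~((std::size_t)(s) - 1))

so ROUND_DOWN(n, 4) is the largest multiple of 4 not exceeding n, and the scalar tail loop handles at most three remaining elements.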
Example #21
Normal::Normal(const Vertex &v) // normalization
{
#ifdef AVX2
	__m128 ans = _mm_dp_ps(v.dat, v.dat, 0b01110001);
	__m128 tmp = _mm_broadcastss_ps(_mm_sqrt_ss(ans));
	dat = _mm_div_ps(v.dat, tmp);
#else
#    ifdef SSE4
	__m128 ans = _mm_dp_ps(v.dat, v.dat, 0b01110001);
	ans = _mm_sqrt_ss(ans);
	__m128 tmp = _mm_set1_ps(_mm_cvtss_f32(ans));
	dat = _mm_div_ps(v.dat, tmp);
#    else
	float s = v.x*v.x + v.y*v.y + v.z*v.z;
	s = 1 / sqrt(s);
	x = v.x * s;
	y = v.y * s;
	z = v.z * s;
#    endif
#endif
}
Example #22
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
inline float sum8(__m256 x) 
{
  // hiQuad = ( x7, x6, x5, x4 )
  const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
  // loQuad = ( x3, x2, x1, x0 )
  const __m128 loQuad = _mm256_castps256_ps128(x);
  // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
  const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
  // loDual = ( -, -, x1 + x5, x0 + x4 )
  const __m128 loDual = sumQuad;
  // hiDual = ( -, -, x3 + x7, x2 + x6 )
  const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
  // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
  const __m128 sumDual = _mm_add_ps(loDual, hiDual);
  // lo = ( -, -, -, x0 + x2 + x4 + x6 )
  const __m128 lo = sumDual;
  // hi = ( -, -, -, x1 + x3 + x5 + x7 )
  const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
  // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
  const __m128 sum = _mm_add_ss(lo, hi);
  return _mm_cvtss_f32(sum);
}
Example #23
int test_sqrt()
{
	int Error(0);

#	if GLM_ARCH & GLM_ARCH_SSE2_BIT
	for(float f = 0.1f; f < 30.0f; f += 0.1f)
	{
		float r = _mm_cvtss_f32(_mm_sqrt_ps(_mm_set1_ps(f)));
		float s = std::sqrt(f);
		Error += glm::abs(r - s) < 0.01f ? 0 : 1;
		assert(!Error);
	}
#	endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

	float A = glm::sqrt(10.f);
	glm::vec1 B = glm::sqrt(glm::vec1(10.f));
	glm::vec2 C = glm::sqrt(glm::vec2(10.f));
	glm::vec3 D = glm::sqrt(glm::vec3(10.f));
	glm::vec4 E = glm::sqrt(glm::vec4(10.f));

	return Error;
}
Example #24
static void
matvec_sse()
{
        /* Assume that the data size is an even multiple of the 128 bit
         * SSE vectors (i.e. 4 floats) */
        assert(!(SIZE & 0x3));

        /* TASK: Implement your SSE version of the matrix-vector
         * multiplication here.
         */
        /* HINT: You might find at least the following instructions
         * useful:
         *  - _mm_setzero_ps
         *  - _mm_load_ps
         *  - _mm_hadd_ps
         *  - _mm_cvtss_f32
         *
         * HINT: You can create the sum of all elements in a vector
         * using two hadd instructions.
         */

        __m128 dummy=_mm_setzero_ps();
        for(int i=0;i<SIZE;++i){
            __m128 temp=_mm_setzero_ps();
            for(int j=0;j<SIZE;j+=4){

                __m128 mm_vec_b=_mm_load_ps(vec_b+j);
                __m128 mm_matr=_mm_load_ps(mat_a+MINDEX(i,j));
                __m128 out=_mm_mul_ps(mm_vec_b,mm_matr);
                temp=_mm_add_ps(temp,out);

//                vec_c[i]+=_mm_cvtss_f32(_mm_dp_ps(mm_matr,mm_vec_b,0xf1));
            }
            __m128 res=_mm_hadd_ps(_mm_hadd_ps(temp,dummy),dummy);
            vec_c[i]=_mm_cvtss_f32(res);
        }

}
Example #25
/**
 * Identify bends in the chain, where the kappa angle (virtual bond angle from
 * c-alpha i-2, to i, to i+2) is greater than 70 degrees
 * dssp-2.2.0/structure.cpp:1729
 */
static std::vector<int> calculate_bends(const float* xyz, const int* ca_indices,
    const int* chain_ids, const int n_residues, std::vector<int>& skip)
{
    __m128 prev_ca, this_ca, next_ca, u_prime, v_prime, u, v;
    float kappa;
    std::vector<int> is_bend(n_residues, 0);
    for (int i = 2; i < n_residues-2; i++) {
        if (chain_ids[i-2] == chain_ids[i+2] && !skip[i-2] && !skip[i] && !skip[i+2]) {
            prev_ca = load_float3(xyz + 3*ca_indices[i-2]);
            this_ca = load_float3(xyz + 3*ca_indices[i]);
            next_ca = load_float3(xyz + 3*ca_indices[i+2]);
            u_prime = _mm_sub_ps(prev_ca, this_ca);
            v_prime = _mm_sub_ps(this_ca, next_ca);
            /* normalize the vectors u_prime and v_prime */
            u = _mm_div_ps(u_prime, _mm_sqrt_ps(_mm_dp_ps2(u_prime, u_prime, 0x7F)));
            v = _mm_div_ps(v_prime, _mm_sqrt_ps(_mm_dp_ps2(v_prime, v_prime, 0x7F)));
            /* compute the arccos of the dot product. this gives the angle */
            kappa = (float) acos(CLIP(_mm_cvtss_f32(_mm_dp_ps2(u, v, 0x71)), -1, 1));
            is_bend[i] = kappa > (70 * (M_PI / 180.0));
        }
    }
    return is_bend;
}
Example #26
static float ks_donor_acceptor(const float* xyz, const float* hcoords,
			       const int* nco_indices, int donor, int acceptor)
{
  /* Compute the Kabsch-Sander hydrogen bond energy between two residues
     in a single conformation.

     Parameters
     ----------
     xyz : array, shape=(n_atoms, 3)
         All of the atoms in this frame
     hcoords : array, shape=(n_residues, 3)
         The coordinates of the backbone amide hydrogen of each residue.
     nco_indices : array, shape=(n_residues, 3)
         The indices of the backbone N, C, and O atoms for each residue.
     donor : int
         Index of the residue donating the hydrogen bond proton (its N and H
         are used).
     acceptor : int
         Index of the residue accepting the proton (its C and O are used).

     Returns
     -------
     energy : float
         The KS backbone hydrogen bond energy, in kcal/mol. A number under -0.5
         is considered significant.
  */
  float energy;
  __m128 r_n, r_h, r_c, r_o, r_ho, r_nc, r_hc, r_no, d2_honchcno;
  __m128 coupling;

  // 332 (kcal*A/mol) * 0.42 * 0.2 * (1nm / 10 A)
  coupling = _mm_setr_ps(-2.7888, -2.7888, 2.7888, 2.7888);
  r_n = load_float3(xyz + 3*nco_indices[3*donor]);
  r_h = load_float3(hcoords + 3*donor);
  r_c = load_float3(xyz + 3*nco_indices[3*acceptor + 1]);
  r_o = load_float3(xyz + 3*nco_indices[3*acceptor + 2]);

  //printf("Donor Index %d\n", donor);
  //printf("Acceptor Index %d\n", acceptor);
  /*printf("N index %d\n", 3*nco_indices[3*donor + 0]);
  printf("C index %d\n", 3*nco_indices[3*acceptor + 1]);
  printf("O index %d\n", 3*nco_indices[3*acceptor + 2]);
  printf("\nrN ");
  printf_m128(r_n);
  printf("rH ");
  printf_m128(r_h);
  printf("rC ");
  printf_m128(r_c);
  printf("rO ");
  printf_m128(r_o);*/

  r_ho = _mm_sub_ps(r_h, r_o);
  r_hc = _mm_sub_ps(r_h, r_c);
  r_nc = _mm_sub_ps(r_n, r_c);
  r_no = _mm_sub_ps(r_n, r_o);

  // compute all four dot products (each of the squared distances), and then
  // pack them into a single float4 using three shuffles.
  d2_honchcno = _mm_shuffle_ps(_mm_shuffle_ps(_mm_dp_ps(r_ho, r_ho, 0xF3), _mm_dp_ps(r_nc, r_nc, 0xF3), _MM_SHUFFLE(0,1,0,1)),
                               _mm_shuffle_ps(_mm_dp_ps(r_hc, r_hc, 0xF3), _mm_dp_ps(r_no, r_no, 0xF3), _MM_SHUFFLE(0,1,0,1)),
                               _MM_SHUFFLE(2,0,2,0));

  energy = _mm_cvtss_f32(_mm_dp_ps(coupling, _mm_rsqrt_ps(d2_honchcno), 0xFF));
  //printf("Energy: %f\n\n", energy);
  return (energy < -9.9f ? -9.9f : energy);
}
Example #27
    void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

        const auto num_full_blocks = output_width / C_data_stride;
        const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
        const auto subsimd_block_size = output_width % C_simd_width;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];

        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();
        float subsimd_sum = 0.0f;
        {
            auto input_buffer = input_f;
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
            case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
            case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
            case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
            case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
            case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
            case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        {
            __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
            intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

            acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
            subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

            acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
            subsimd_sum = 1.0f / subsimd_sum;
        }

        {
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
            case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
            case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
            case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
            case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
            case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
            case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }
        _mm_free(input_f);
    }
Example #28
void sgemm( int m_a, int n_a, float *A, float *B, float *C ) {

  int mpad = m_a%STEPM ? (m_a/STEPM+1)*STEPM:m_a;
  int npad = n_a%STEPN ? (n_a/STEPN+1)*STEPN:n_a;
  int mbackup = m_a;

  float* Apad=malloc(mpad*npad*sizeof(float));
  transposeA(n_a, m_a, npad, mpad, Apad, A);
  A=Apad;
  float* Bpad=malloc(npad*mpad*sizeof(float));
  transposeB(n_a, m_a, npad, mpad, Bpad, B);
  B=Bpad;            
 
  float* Cpad = malloc(mpad*mpad*sizeof(float));
  float* backup = C;
  C = Cpad;

  m_a = mpad;
  n_a = npad;

#pragma omp parallel 
  {
    //    __m128 c0,a1, c1, a2, c2, a3, c3, a4, c4;
    __m128 a1, a2, a3, a4, c0;
    __m128 c11, c12, c13, c14;
    __m128 c21, c22, c23, c24;
    __m128 c31, c32, c33, c34;
    __m128 c41, c42, c43, c44;

    __m128 b1, b2, b3, b4;
    float temp0,temp1,temp2,temp3,temp4, temp5, temp6, temp7, temp8;
    int ii=0;
    int jj=0;
    int kk=0;

    int kkma=0;
    int jjna=0;
    int jjma=0; 
    int iina=0;

#pragma omp for
    for( int j = 0; j < m_a; j+=4 ) {
      jj=j;

      jjma=jj*m_a;
      jjna=jj*n_a;

      for( int i = 0; i < m_a; i+=4 ) {
	ii=i;

	iina=ii*n_a;

	c31=c32=c33=c34=c41=c42=c43=c44=c11=c12=c13=c14=c21=c22=c23=c24 = _mm_setzero_ps();
 
	for( int k = 0; k < n_a; k+=4 ) {
	  float* tempA=A+k+iina;
	  float* tempB=B+k+jjna;

	  b1 = _mm_loadu_ps(tempB);
	  b2 = _mm_loadu_ps(tempB+n_a);
	  b3 = _mm_loadu_ps(tempB+2*n_a);
	  b4 = _mm_loadu_ps(tempB+3*n_a);
	  /////////////////////////////////////////
	  a1 = _mm_loadu_ps(tempA);
	  a2 = _mm_loadu_ps(tempA+n_a);
	  a3 = _mm_loadu_ps(tempA+n_a*2);
	  a4 = _mm_loadu_ps(tempA+n_a*3);

	  c11=_mm_add_ps(c11,  _mm_mul_ps(a1, b1));
	  c21 = _mm_add_ps(c21, _mm_mul_ps(a2, b1));

	  c12 = _mm_add_ps(c12, _mm_mul_ps(a1, b2));
	  c22 = _mm_add_ps(c22, _mm_mul_ps(a2, b2));

	  c13= _mm_add_ps(c13,  _mm_mul_ps(a1, b3));
	  c23 = _mm_add_ps(c23, _mm_mul_ps(a2, b3));

	  c14 = _mm_add_ps(c14, _mm_mul_ps(a1, b4));
	  c24 = _mm_add_ps(c24, _mm_mul_ps(a2, b4));

	  c31=_mm_add_ps(c31,  _mm_mul_ps(a3, b1));
	  c41 = _mm_add_ps(c41, _mm_mul_ps(a4, b1));

	  c32 = _mm_add_ps(c32, _mm_mul_ps(a3, b2));
	  c42 = _mm_add_ps(c42, _mm_mul_ps(a4, b2));

	  c33= _mm_add_ps(c33,  _mm_mul_ps(a3, b3));
	  c43 = _mm_add_ps(c43, _mm_mul_ps(a4, b3));

	  c34 = _mm_add_ps(c34, _mm_mul_ps(a3, b4));
	  c44 = _mm_add_ps(c44, _mm_mul_ps(a4, b4));
	}

	c0=  _mm_hadd_ps(c11,c11);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c12,c12);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c13,c13);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c14,c14);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c21,c21);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c22,c22);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c23,c23);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c24,c24);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c31,c31);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c32,c32);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c33,c33);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c34,c34);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c41,c41);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c42,c42);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c43,c43);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c44,c44);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*3] = _mm_cvtss_f32(c0);
	/*
	c0=  _mm_hadd_ps(c11,c11);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma] = _mm_cvtss_f32(c0);

	c11=  _mm_hadd_ps(c12,c12);
	c11=  _mm_hadd_ps(c11,c11);
	C[ii+jjma+m_a]= _mm_cvtss_f32(c11); 

	c12=  _mm_hadd_ps(c13,c13);
	c12=  _mm_hadd_ps(c12,c12);   
	C[ii+jjma+m_a*2]=_mm_cvtss_f32(c12);

	c13=  _mm_hadd_ps(c14,c14);
	c13=  _mm_hadd_ps(c13,c13);   
	C[ii+jjma+m_a*3]= _mm_cvtss_f32(c13);

	c14=  _mm_hadd_ps(c21,c21);
	c14=  _mm_hadd_ps(c14,c14); 
	C[ii+jjma+1] = _mm_cvtss_f32(c14);

	c21=  _mm_hadd_ps(c22,c22);
	c21=  _mm_hadd_ps(c21,c21);
	C[ii+jjma+m_a+1] = _mm_cvtss_f32(c21); 

	c22=  _mm_hadd_ps(c23,c23);
	c22=  _mm_hadd_ps(c22,c22);   
	C[ii+jjma+2*m_a+1]=_mm_cvtss_f32(c22);

	c23=  _mm_hadd_ps(c24,c24);
	c23=  _mm_hadd_ps(c23,c23);   
	C[ii+jjma+3*m_a+1]= _mm_cvtss_f32(c23);
	*/
      }
    } 
  }

  move(mbackup, mpad, C, backup);
  free(A);
  free(B);
  free(C);
}             
Example #29
 /*!
  * \brief Perform a horizontal sum of the given vector.
  * \param in The input vector type
  * \return the horizontal sum of the vector
  */
 ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
     const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
     const __m128 x64  = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
     const __m128 x32  = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
     return _mm_cvtss_f32(x32);
 }
Example #30
static inline float rsqrt_fast(const float x)
{
  const __m128 a = _mm_set_ss(x);
  const __m128 r = _mm_rsqrt_ps(a);
  return _mm_cvtss_f32(r);
}
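As with _mm_rcp_ss above, _mm_rsqrt_ps yields only a ~12-bit estimate. When that is too coarse, one Newton-Raphson step brings it near full single precision; a sketch building on rsqrt_fast:

static inline float rsqrt_refined(const float x)
{
  const float r = rsqrt_fast(x);
  return r * (1.5f - 0.5f * x * r * r); // one NR step for 1/sqrt(x)
}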