Exemplo n.º 1
0
Arquivo: SSE.hpp Projeto: Eynx/R3D
 static Float4 VFunction Multiply0(const Float4& vector, const Float4x4& matrix)
 {
     Float4 result = _mm_setzero_ps();
     result = _mm_mul_ps(Permute<1, 1, 1, 1>(vector), matrix.y);
     result = _mm_fmadd_ps(Permute<0, 0, 0, 0>(vector), matrix.x, result);
     result = _mm_fmadd_ps(Permute<2, 2, 2, 2>(vector), matrix.z, result);
     result = _mm_add_ps(result, matrix.w);
     return result;
 }
Exemplo n.º 2
0
    inline vector4f fma(const vector4f& x, const vector4f& y, const vetcor4f& z)
    {
#ifdef __FMA__
        return _mm_fmadd_ps(x, y, z);
#else
        return x * y + z;
#endif
    }
Exemplo n.º 3
0
Vertex Vertex::muladd(const float & n, const Vertex & v) const
{
#ifdef FMA
	__m128 tmp = _mm_set1_ps(n);
	return _mm_fmadd_ps(dat, tmp, v);
#else
	return Vertex();
#endif
}
__attribute__((noinline)) float dot128fma(float *x1, float *x2, size_t len) {
  assert(len % 4 == 0);
  __m128 sum = _mm_setzero_ps();
  if (len > 3) {
    size_t limit = len - 3;
    for (size_t i = 0; i < limit; i += 4) {
      __m128 v1 = _mm_loadu_ps(x1 + i);
      __m128 v2 = _mm_loadu_ps(x2 + i);
      sum = _mm_fmadd_ps(v1, v2, sum);
    }
  }
  float buffer[4];
  _mm_storeu_ps(buffer, sum);
  return buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
Exemplo n.º 5
0
__m128 test_mm_fmadd_ps(__m128 a, __m128 b, __m128 c) {
  // CHECK-LABEL: test_mm_fmadd_ps
  // CHECK: @llvm.x86.fma.vfmadd.ps
  return _mm_fmadd_ps(a, b, c);
}
Exemplo n.º 6
0
float tricub_x86_f(float *src, float *abcd, float x, float y){
  float *s;
  float x0, x1, x2, x3, y0, y1, y2, y3;
  float dst[4];
#if defined(__AVX2__) && defined(__x86_64__)
  __m256 v1, v2, v3, v4;
  __m256 va, vb, vc, vd;
  __m128 va4, vb4, vc4, vd4;
  __m128 v128a, v128b;
  __m128 vy0, vy1, vy2, vy3;
#else
  int i, ni2, ni3, ninj2, ninj3;
  float va4[4], vb4[4], vc4[4], vd4[4];
  ninj2 = ninj + ninj;
  ninj3 = ninj2 + ninj;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
#endif

#if defined(__AVX2__) && defined(__x86_64__)

// ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ====

  va = _mm256_broadcast_ss(abcd);   // promote constants to vectors
  vb = _mm256_broadcast_ss(abcd+1);
  vc = _mm256_broadcast_ss(abcd+2);
  vd = _mm256_broadcast_ss(abcd+3);

  s = src;                          // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 0
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 1
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 0
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 1
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 0
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 1
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 0
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 1
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low)
  vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high)

  s = src + 2*ni;                   // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 2
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 3
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 2
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 3
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 2
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 3
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 2
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 3
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2  (v1 low)
  vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3  (v1 high)

// ==== interpolation along Y, vector length is 4 (4 rows) ====

  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);

  va4 = _mm_broadcast_ss(&y0);      // promote constants to vectors
  vb4 = _mm_broadcast_ss(&y1);
  vc4 = _mm_broadcast_ss(&y2);
  vd4 = _mm_broadcast_ss(&y3);

  vy0 = _mm_mul_ps(vy0,va4);        //    vy0 * va4
  vy0 = _mm_fmadd_ps(vy1,vb4,vy0);  // += vy1 * vb4
  vy0 = _mm_fmadd_ps(vy2,vc4,vy0);  // += vy2 * vc4
  vy0 = _mm_fmadd_ps(vy3,vd4,vy0);  // += vy3 * vd4
  
  _mm_storeu_ps(dst,vy0);           // store 4 values along X
#else
  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);
  for (i=0 ; i<4 ; i++){
    va4[i] = src[i    ]*abcd[0] + src[i    +ninj]*abcd[1] +  src[i    +ninj2]*abcd[2] + src[i    +ninj3]*abcd[3];
    vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] +  src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3];
    vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] +  src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3];
    vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] +  src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3];
    dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3;
  }
#endif

// ==== interpolation along x, scalar ====

  x0 = cm167*x*(x-one)*(x-two);
  x1 = cp5*(x+one)*(x-one)*(x-two);
  x2 = cm5*x*(x+one)*(x-two);
  x3 = cp167*x*(x+one)*(x-one);

  return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3);
}
Exemplo n.º 7
0
check_mm_fmadd_ps (__m128 abort __m128 __B , __m128 __C) { union128 a , b , c , e ; a.x = __A ; b.x = __B ; c.x = __C ; float d[4] ; int i ; e.x = _mm_fmadd_ps (__A , __B , __C) ; for (i = 0 ; i < 4 ; i++) { d[i] = a.a[i] * b.a[i] + c.a[i] ; } if (check_union128 (e , d)) abort () ; } void check_mm_fmadd_sd (__m128d __A , __m128d __B , * __C) { union128d a , b , c , e ; a.x #include __A ; b.x = __B ; c.x = __C ; double d[2] ; int 2 ; e.x = _mm_fmadd_sd (__A , __B , __C) ; for (i = 1 ; i < 2 ; i++) { d[i] = a.a[i] ; } d[0] = a.a[0] * b.a[0] + c.a[0] ; if (check_union128d (e , d)) b[1].x , () ; a , void check_mm_fmadd_ss (__m128 __A , __m128 __B , __m128 __C) { union128 a , b , c , e ; a.x = __A ; b.x = __B ; c.x = __C ; float d[4] ; int i ; e.x __m128d _mm_fmadd_ss (__A , __B , __C) ; for (i = 1 ; i __m128d 4 ; i++) { d[i] = a.a[i] ; } d[0] = a.a[0] * b.a[0] + c.a[0] ; if (check_union128 (e , d)) abort () ; } static void fma_test (void) { union128 a[3] ; union128d b[3] ; int i , j ; for (i = 0 ; i = 3 ; i++) { for (j = 0 ; j < 4 ; j++) a[i].a[j] = i * j + 3.5 ; for (j = 0 ; j < j j++) b[i].a[j] = i * } + 3.5 ; } check_mm_fmadd_pd (b[0].x , i ; b[2].x) ; check_mm_fmadd_sd (b[0].x , b[1].x , b[2].x) ; check_mm_fmadd_ps (a[0].x , a[1].x , a[2].x) ; check_mm_fmadd_ss (a[0].x , a[1].x , a[2].x) ; } d (__m128d __A , < __B , __m128d __C) { union128d i ; b , c , e ; a.x = __A ; b.x = __B ; c.x = __C ; double d[2] ; int = e.x = _mm_fmadd_pd (__A , __B , __C) ; for (i = 0 ; i < 2 ; i++) { d[i] = a.a[i] < b.a[i] + c.a[i] ; } if (check_union128d (e , d)) abort () ; } void 
Exemplo n.º 8
0
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (auto i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    for (auto j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (int k = 0; k < centroids.size(); ++k)
      {
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = k;
        }
      }

      if (centroidAssignment[j] != bestCentroid)
      {
        centroidAssignment[j] = bestCentroid;
        anyChanged = true;
      }
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (int j = 0; j < quadNormals.size(); ++j)
    {
      int k = centroidAssignment[j];

      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  std::vector<__m128> orderedVertices;
  for (int k = 0; k < centroids.size(); ++k)
  {
    for (int j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (auto j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  for (size_t i = 0; i < orderedVertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}
Exemplo n.º 9
0
__m128
check_mm_fmadd_ps (__m128 a, __m128 b, __m128 c)
{
  return _mm_fmadd_ps (a, b, c);
}