예제 #1
0
/** transform vector by rigid transform */
inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec)
{
#ifdef SIMPLE_GL_USE_SSE4
    __m128 res;
    __m128 dotProd;

    res      = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);\
    dotProd  = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);\
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\
    dotProd  = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);\
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\
    dotProd  = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);\
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) );

    return Matrix<float, 4, 1>(res);
#elif defined(SIMPLE_GL_USE_SSE3)
    __m128 res;

    __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);

    __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);

    __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);

    __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);

    __m128 vec01    = _mm_unpacklo_ps(dotProd0, dotProd1);
    __m128 vec23    = _mm_unpackhi_ps(dotProd2, dotProd3);
    res             = _mm_movelh_ps(vec01, vec23);

    return Matrix<float, 4, 1>(res);
#else // SSE2
    // TODO: Think about good sse optimization
    Matrix<float, 4, 1> res;
    res[0] = mat[0][0] * res[0] + mat[0][1] * res[1] + mat[0][2] * res[2] + mat[0][3] * res[3];
    res[1] = mat[1][0] * res[0] + mat[1][1] * res[1] + mat[1][2] * res[2] + mat[1][3] * res[3];
    res[2] = mat[2][0] * res[0] + mat[2][1] * res[1] + mat[2][2] * res[2] + mat[2][3] * res[3];
    res[3] = mat[3][0] * res[0] + mat[3][1] * res[1] + mat[3][2] * res[2] + mat[3][3] * res[3];
    return res;
#endif
}
예제 #2
0
파일: testimm-3.c 프로젝트: 0day-ci/gcc
void
test4bit (void)
{
  d1 = _mm_round_pd (d2, k4);		  /* { dg-error "the last argument must be a 4-bit immediate" } */
  d1 = _mm_round_sd (d2, d3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ps (a2, k4);		  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ss (a2, a2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_blend_ps (a2, a3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_blend_pd (e2, e3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_round_pd (e2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  b1 = _mm256_round_ps (b2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
}
예제 #3
0
static void
TEST (void)
{
  __m128 x, y;
  union
    {
      __m128 x[NUM];
      float f[NUM * 4];
    } dst, src1, src2;
  union
    {
      __m128 x;
      float f[4];
    } src3;
  int i;

  init_blendps (src1.f, src2.f);

  for (i = 0; i < 4; i++)
    src3.f[i] = (int) random ();

  /* Check blendps imm8, m128, xmm */
  for (i = 0; i < NUM; i++)
    {
      dst.x[i] = _mm_blend_ps (src1.x[i], src2.x[i], MASK); 
      if (check_blendps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4]))
	abort ();
    }
    
   /* Check blendps imm8, xmm, xmm */
  x = _mm_blend_ps (dst.x[2], src3.x, MASK);
  y = _mm_blend_ps (src3.x, dst.x[2], MASK);

  if (check_blendps (&x, &dst.f[8], &src3.f[0]))
    abort ();

  if (check_blendps (&y, &src3.f[0], &dst.f[8]))
    abort ();
}
예제 #4
0
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	__m128 ms0 = _mm_load_ps(src);
	__m128 ms1 = _mm_load_ps(src + 4);
	const __m128 mm = _mm_set1_ps(0.5f);
	__m128 a = _mm_add_ps(ms0, ms1);
	__m128 b = _mm_sub_ps(ms0, ms1);

	__m128 t1 = _mm_unpacklo_ps(a, b);
	__m128 t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);

	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
	ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
	ms0 = _mm_blend_ps(ms0, a, 1);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
	ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

	a = _mm_add_ps(ms0, ms1);
	b = _mm_sub_ps(ms0, ms1);

	t1 = _mm_unpacklo_ps(a, b);
	t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	_mm_store_ps(dest, a);
	_mm_store_ps(dest + 4, b);
}
예제 #5
0
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196);//cos(CV_PI*6/16.0)*sqrt(2);

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 p03 = _mm_add_ps(s0, s3);
	__m128 p12 = _mm_add_ps(s1, s2);
	__m128 m03 = _mm_sub_ps(s0, s3);
	__m128 m12 = _mm_sub_ps(s1, s2);

	__m128 v = _mm_add_ps(p03, p12);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(d, v2);

	v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 4, v);

	v = _mm_sub_ps(p03, p12);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 8, v);

	v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 12, v);
}
예제 #6
0
파일: SSE.hpp 프로젝트: Eynx/R3D
 // Blend together two vector's components
 template <Bool X, Bool Y, Bool Z, Bool W> inline Vector VFunction Blend(const Vector& vectorA, const Vector& vectorB)
 {
     return _mm_blend_ps(vectorA, vectorB, (X << 3) | (Y << 2) | (Z << 1) | W);
 }
예제 #7
0
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (auto i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    for (auto j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (int k = 0; k < centroids.size(); ++k)
      {
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = k;
        }
      }

      if (centroidAssignment[j] != bestCentroid)
      {
        centroidAssignment[j] = bestCentroid;
        anyChanged = true;
      }
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (int j = 0; j < quadNormals.size(); ++j)
    {
      int k = centroidAssignment[j];

      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  std::vector<__m128> orderedVertices;
  for (int k = 0; k < centroids.size(); ++k)
  {
    for (int j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (auto j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  for (size_t i = 0; i < orderedVertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}
예제 #8
0
gboolean
gimp_operation_normal_mode_process_pixels_sse4 (gfloat              *in,
                                                gfloat              *aux,
                                                gfloat              *mask,
                                                gfloat              *out,
                                                gfloat               opacity,
                                                glong                samples,
                                                const GeglRectangle *roi,
                                                gint                 level)
{
  /* check alignment */
  if ((((uintptr_t)in) | ((uintptr_t)aux) | ((uintptr_t)out)) & 0x0F)
    {
      return gimp_operation_normal_mode_process_pixels_core (in, aux, mask, out,
                                                             opacity, samples,
                                                             roi, level);
    }
  else
    {
      const __v4sf *v_in  = (const __v4sf*) in;
      const __v4sf *v_aux = (const __v4sf*) aux;
            __v4sf *v_out = (      __v4sf*) out;

      const __v4sf one = _mm_set1_ps (1.0f);
      const __v4sf v_opacity = _mm_set1_ps (opacity);

      while (samples--)
        {
          __v4sf rgba_in, rgba_aux, alpha;

          rgba_in  = *v_in++;
          rgba_aux = *v_aux++;

          /* expand alpha */
          alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_aux,
                                             _MM_SHUFFLE (3, 3, 3, 3));

          if (mask)
            {
              __v4sf mask_alpha;

              /* multiply aux's alpha by the mask */
              mask_alpha = _mm_set1_ps (*mask++);
              alpha = alpha * mask_alpha;
            }

          alpha = alpha * v_opacity;

          if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
            {
              __v4sf dst_alpha, a_term, out_pixel, out_alpha;

              /* expand alpha */
              dst_alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_in,
                                                     _MM_SHUFFLE (3, 3, 3, 3));

              /* a_term = dst_a * (1.0 - src_a) */
              a_term = dst_alpha * (one - alpha);

              /* out(color) = src * src_a + dst * a_term */
              out_pixel = rgba_aux * alpha + rgba_in * a_term;

              /* out(alpha) = 1.0 * src_a + 1.0 * a_term */
              out_alpha = alpha + a_term;

              /* un-premultiply */
              out_pixel = out_pixel / out_alpha;

              /* swap in the real alpha */
              out_pixel = _mm_blend_ps (out_pixel, out_alpha, 0x08);

              *v_out++ = out_pixel;
            }
          else
            {
              *v_out++ = rgba_in;
            }
        }
    }

  return TRUE;
}
예제 #9
0
__m128 test_blend_ps(__m128 V1, __m128 V2) {
  // CHECK-LABEL: test_blend_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  // CHECK-ASM: blendps $6, %xmm{{.*}}, %xmm{{.*}}
  return _mm_blend_ps(V1, V2, 6);
}
예제 #10
0
void fDCT2D8x4_and_threshold_keep00_32f(const float* x, float* y, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();

	__m128 c0 = _mm_load_ps(x);
	__m128 c1 = _mm_load_ps(x + 56);
	__m128 t0 = _mm_add_ps(c0, c1);
	__m128 t7 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 48);
	c0 = _mm_load_ps(x + 8);
	__m128 t1 = _mm_add_ps(c0, c1);
	__m128 t6 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 40);
	c0 = _mm_load_ps(x + 16);
	__m128 t2 = _mm_add_ps(c0, c1);
	__m128 t5 = _mm_sub_ps(c0, c1);

	c0 = _mm_load_ps(x + 24);
	c1 = _mm_load_ps(x + 32);
	__m128 t3 = _mm_add_ps(c0, c1);
	__m128 t4 = _mm_sub_ps(c0, c1);

	/*
	c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
	c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
	c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
	c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
	*/

	c0 = _mm_add_ps(t0, t3);
	__m128 c3 = _mm_sub_ps(t0, t3);
	c1 = _mm_add_ps(t1, t2);
	__m128 c2 = _mm_sub_ps(t1, t2);

	/*
	c0 = t0 + t3; c3 = t0 - t3;
	c1 = t1 + t2; c2 = t1 - t2;
	*/

	const __m128 invsqrt2h = _mm_set_ps1(0.353554f);

	__m128 v = _mm_mul_ps(_mm_add_ps(c0, c1), invsqrt2h);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(y, v2);

	v = _mm_mul_ps(_mm_sub_ps(c0, c1), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 32, v);

	/*y[0] = c0 + c1;
	y[4] = c0 - c1;*/

	__m128 w0 = _mm_set_ps1(0.541196f);
	__m128 w1 = _mm_set_ps1(1.306563f);
	v = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(w0, c2), _mm_mul_ps(w1, c3)), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 16, v);

	v = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(w0, c3), _mm_mul_ps(w1, c2)), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 48, v);
	/*
	y[2] = c2 * r[6] + c3 * r[2];
	y[6] = c3 * r[6] - c2 * r[2];
	*/

	w0 = _mm_set_ps1(1.175876f);
	w1 = _mm_set_ps1(0.785695f);
	c3 = _mm_add_ps(_mm_mul_ps(w0, t4), _mm_mul_ps(w1, t7));
	c0 = _mm_sub_ps(_mm_mul_ps(w0, t7), _mm_mul_ps(w1, t4));
	/*
	c3 = t4 * r[3] + t7 * r[5];
	c0 = t7 * r[3] - t4 * r[5];
	*/

	w0 = _mm_set_ps1(1.387040f);
	w1 = _mm_set_ps1(0.275899f);
	c2 = _mm_add_ps(_mm_mul_ps(w0, t5), _mm_mul_ps(w1, t6));
	c1 = _mm_sub_ps(_mm_mul_ps(w0, t6), _mm_mul_ps(w1, t5));
	/*
	c2 = t5 * r[1] + t6 * r[7];
	c1 = t6 * r[1] - t5 * r[7];
	*/

	v = _mm_mul_ps(_mm_sub_ps(c0, c2), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);

	_mm_store_ps(y + 24, v);

	v = _mm_mul_ps(_mm_sub_ps(c3, c1), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 40, v);
	//y[5] = c3 - c1; y[3] = c0 - c2;

	const __m128 invsqrt2 = _mm_set_ps1(0.707107f);
	c0 = _mm_mul_ps(_mm_add_ps(c0, c2), invsqrt2);
	c3 = _mm_mul_ps(_mm_add_ps(c3, c1), invsqrt2);
	//c0 = (c0 + c2) * invsqrt2;
	//c3 = (c3 + c1) * invsqrt2;

	v = _mm_mul_ps(_mm_add_ps(c0, c3), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 8, v);

	v = _mm_mul_ps(_mm_sub_ps(c0, c3), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);

	_mm_store_ps(y + 56, v);
	//y[1] = c0 + c3; y[7] = c0 - c3;

	/*for(i = 0;i < 8;i++)
	{
	y[i] *= invsqrt2h;
	}*/
}
예제 #11
0
__m128 test_blend_ps(__m128 V1, __m128 V2) {
  // CHECK-LABEL: @test_blend_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  return _mm_blend_ps(V1, V2, 5);
}
예제 #12
0
파일: main.cpp 프로젝트: minh0722/HPC2015
static void bar( // bitonic_simd_sort
                float (& inout)[8]) {
    
    const __m128 r0_in0 = _mm_load_ps(inout + 0); // 0, 1, 2, 3
    const __m128 r0_in1 = _mm_load_ps(inout + 4); // 4, 5, 6, 7
    
    // stage 0
    const __m128 r0_A = _mm_shuffle_ps(r0_in0, r0_in1, 0xcc); // 0, 3, 4, 7
    const __m128 r0_B = _mm_shuffle_ps(r0_in0, r0_in1, 0x99); // 1, 2, 5, 6
    
    const __m128 r0_min = _mm_min_ps(r0_A, r0_B); // 0, 3, 4, 7
    const __m128 r0_max = _mm_max_ps(r0_A, r0_B); // 1, 2, 5, 6
    
    // stage 1
    __m128 r1_A = _mm_shuffle_ps(r0_max, r0_max, 0xf0); // 1, 1, 6, 6
    __m128 r1_B = _mm_shuffle_ps(r0_max, r0_max, 0xa5); // 2, 2, 5, 5
    r1_A = _mm_blend_ps(r1_A, r0_min, 0x9);      // 0, 1, 6, 7
    r1_B = _mm_blend_ps(r1_B, r0_min, 0x6);      // 2, 3, 4, 5
    
    const __m128 r1_min = _mm_min_ps(r1_A, r1_B); // 0, 1, 6, 7
    const __m128 r1_max = _mm_max_ps(r1_A, r1_B); // 2, 3, 4, 5
    
    // stage 2
    __m128 r2_A = _mm_shuffle_ps(r1_max, r1_max, 0xf0); // 2, 2, 5, 5
    __m128 r2_B = _mm_shuffle_ps(r1_min, r1_min, 0xa5); // 1, 1, 6, 6
    r2_A = _mm_blend_ps(r2_A, r1_min, 0x9);      // 0, 2, 5, 7
    r2_B = _mm_blend_ps(r2_B, r1_max, 0x6);      // 1, 3, 4, 6
    
    const __m128 r2_min = _mm_min_ps(r2_A, r2_B); // 0, 2, 5, 7
    const __m128 r2_max = _mm_max_ps(r2_A, r2_B); // 1, 3, 4, 6
    
    // stage 3
    const __m128 r3_A = _mm_unpacklo_ps(r2_min, r2_max); // 0, 1, 2, 3
    const __m128 r3_B = _mm_unpackhi_ps(r2_max, r2_min); // 4, 5, 6, 7
    
    const __m128 r3_min = _mm_min_ps(r3_A, r3_B); // 0, 1, 2, 3
    const __m128 r3_max = _mm_max_ps(r3_A, r3_B); // 4, 5, 6, 7
    
    // stage 4
    const __m128 r4_A = _mm_movelh_ps(r3_min, r3_max); // 0, 1, 4, 5
    const __m128 r4_B = _mm_movehl_ps(r3_max, r3_min); // 2, 3, 6, 7
    
    const __m128 r4_min = _mm_min_ps(r4_A, r4_B); // 0, 1, 4, 5
    const __m128 r4_max = _mm_max_ps(r4_A, r4_B); // 2, 3, 6, 7
    
    // stage 5
    const __m128 r5_a = _mm_unpacklo_ps(r4_min, r4_max); // 0, 2, 1, 3
    const __m128 r5_b = _mm_unpackhi_ps(r4_min, r4_max); // 4, 6, 5, 7
    const __m128 r5_A = _mm_movelh_ps(r5_a, r5_b); // 0, 2, 4, 6
    const __m128 r5_B = _mm_movehl_ps(r5_b, r5_a); // 1, 3, 5, 7
    
    const __m128 r5_min = _mm_min_ps(r5_A, r5_B); // 0, 2, 4, 6
    const __m128 r5_max = _mm_max_ps(r5_A, r5_B); // 1, 3, 5, 7
    
    // output
    const __m128 out0 = _mm_unpacklo_ps(r5_min, r5_max); // 0, 1, 2, 3
    const __m128 out1 = _mm_unpackhi_ps(r5_min, r5_max); // 4, 5, 6, 7
    
    _mm_store_ps(inout + 0, out0);
    _mm_store_ps(inout + 4, out1);
}