Beispiel #1
0
Datei: main.cpp Projekt: sclc/DPP
//-----------------------------------------------------------------
// AOS -> SOA
//
//    pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
// ->
//    pBlu:  b0, b1, b2, b3, b4, ...
//    pGrn:  g0, g1, g2, g3, g4, ...
//    pRed:  r0, r1, r2, r3, r4, ...
//
// De-interleaves packed (b,g,r) float triplets into three planar arrays.
//
//    pBgr             interleaved input, `length` floats
//    pBlu/pGrn/pRed   outputs, each receives length / 3 floats
//    length           number of floats in pBgr; a trailing partial
//                     triplet (length % 3 floats) is ignored
//
// Whole groups of 8 triplets (24 floats) go through the AVX shuffle path;
// the remaining triplets are copied by a scalar loop, so nothing beyond
// `length` input floats is read and nothing beyond length / 3 output floats
// is written. Unaligned loads/stores are used, so the buffers need no
// particular alignment.
void
aos2soa(float *pBgr, float *pBlu, float *pGrn, float *pRed, const size_t length)
{
    const size_t triplets     = length / 3;
    const size_t simdTriplets = triplets - (triplets % 8);

    const float *src = pBgr;
    size_t t = 0;

    for (; t < simdTriplets; t += 8, src += 24)
    {
        // load the lower halves
        __m256 m03 = _mm256_castps128_ps256(_mm_loadu_ps(src +  0));
        __m256 m14 = _mm256_castps128_ps256(_mm_loadu_ps(src +  4));
        __m256 m25 = _mm256_castps128_ps256(_mm_loadu_ps(src +  8));
        // load the upper halves
        m03 = _mm256_insertf128_ps(m03, _mm_loadu_ps(src + 12), 1);
        m14 = _mm256_insertf128_ps(m14, _mm_loadu_ps(src + 16), 1);
        m25 = _mm256_insertf128_ps(m25, _mm_loadu_ps(src + 20), 1);

        __m256 bg = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2, 1, 3, 2)); // upper b's and g's
        __m256 gr = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1, 0, 2, 1)); // lower g's and r's
        __m256 bb = _mm256_shuffle_ps(m03, bg,  _MM_SHUFFLE(2, 0, 3, 0));
        __m256 gg = _mm256_shuffle_ps(gr, bg,   _MM_SHUFFLE(3, 1, 2, 0));
        __m256 rr = _mm256_shuffle_ps(gr, m25,  _MM_SHUFFLE(3, 0, 3, 1));

        _mm256_storeu_ps(pBlu + t, bb);
        _mm256_storeu_ps(pGrn + t, gg);
        _mm256_storeu_ps(pRed + t, rr);
    }

    // scalar tail: fewer than 8 triplets left
    for (; t < triplets; ++t, src += 3)
    {
        pBlu[t] = src[0];
        pGrn[t] = src[1];
        pRed[t] = src[2];
    }
}
Beispiel #2
0
 // Combines two 256-bit float vectors with a compile-time selector.
 // In every 128-bit half, result elements 0 and 1 are taken from `lower`
 // (source indices lower_i0, lower_i1) and result elements 2 and 3 from
 // `upper` (source indices upper_i0, upper_i1).
 BOOST_FORCEINLINE
 __m256 shuffle(__m256 const lower, __m256 const upper)
 {
   // The four index names are not declared inside this block (presumably
   // enclosing template parameters); the intrinsic needs them as an immediate.
   enum { selector = _MM_SHUFFLE(upper_i1, upper_i0, lower_i1, lower_i0) };
   return _mm256_shuffle_ps(lower, upper, selector);
 }
Beispiel #3
0
// Fills tri[0..2] from the SIMD vector PaGetSimdVector() returns for
// (pa.cur, slot), for each of its four components:
//   tri[0], tri[1]  <- the component unchanged
//   tri[2]          <- the component with neighbouring elements swapped
//                      (element order 1,0,3,2 inside every 128-bit half)
// Then passes PaLineList0 / PaLineListSingle0 to SetNextPaState(), adds
// KNOB_VS_SIMD_WIDTH to pa.numPrimsComplete and returns true.
bool PaLineList0(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &src = PaGetSimdVector(pa, pa.cur, slot);

    UINT comp = 0;
    while (comp < 4)
    {
        tri[1].v[comp] = src.v[comp];
        tri[0].v[comp] = tri[1].v[comp];
        tri[2].v[comp] = _mm256_shuffle_ps(src.v[comp], src.v[comp], _MM_SHUFFLE(2, 3, 0, 1));
        ++comp;
    }

    SetNextPaState(pa, PaLineList0, PaLineListSingle0);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
Beispiel #4
0
Datei: main.cpp Projekt: sclc/DPP
//-----------------------------------------------------------------
// SOA -> AOS
//
//    pBlu:  b0, b1, b2, b3, b4, ...
//    pGrn:  g0, g1, g2, g3, g4, ...
//    pRed:  r0, r1, r2, r3, r4, ...
// ->
//    pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
//
// Interleaves three planar float arrays into packed (b,g,r) triplets.
//
//    pBlu/pGrn/pRed   inputs, length / 3 floats are read from each
//    pBgr             interleaved output, receives 3 * (length / 3) floats
//    length           number of floats in pBgr; a trailing partial
//                     triplet (length % 3 floats) is left untouched
//
// Whole groups of 8 triplets (24 output floats) go through the AVX shuffle
// path; the remaining triplets are written by a scalar loop, so nothing
// beyond `length` output floats is written. Unaligned loads/stores are used,
// so the buffers need no particular alignment.
void
soa2aos(float *pBlu, float *pGrn, float *pRed, float *pBgr, const size_t length)
{
    const size_t triplets     = length / 3;
    const size_t simdTriplets = triplets - (triplets % 8);

    float *dst = pBgr;
    size_t t = 0;

    // 24 floats per pass: 8 b + 8 g + 8 r
    for (; t < simdTriplets; t += 8, dst += 24)
    {
        const __m256 b = _mm256_loadu_ps(pBlu + t);
        const __m256 g = _mm256_loadu_ps(pGrn + t);
        const __m256 r = _mm256_loadu_ps(pRed + t);

        const __m256 bg = _mm256_shuffle_ps(b, g, _MM_SHUFFLE(2, 0, 2, 0));
        const __m256 gr = _mm256_shuffle_ps(g, r, _MM_SHUFFLE(3, 1, 3, 1));
        const __m256 rb = _mm256_shuffle_ps(r, b, _MM_SHUFFLE(3, 1, 2, 0));

        const __m256 r03 = _mm256_shuffle_ps(bg, rb, _MM_SHUFFLE(2, 0, 2, 0));
        const __m256 r14 = _mm256_shuffle_ps(gr, bg, _MM_SHUFFLE(3, 1, 2, 0));
        const __m256 r25 = _mm256_shuffle_ps(rb, gr, _MM_SHUFFLE(3, 1, 3, 1));

        // lower halves hold output floats 0..11, upper halves 12..23
        _mm_storeu_ps(dst +  0, _mm256_castps256_ps128(r03));
        _mm_storeu_ps(dst +  4, _mm256_castps256_ps128(r14));
        _mm_storeu_ps(dst +  8, _mm256_castps256_ps128(r25));
        _mm_storeu_ps(dst + 12, _mm256_extractf128_ps(r03, 1));
        _mm_storeu_ps(dst + 16, _mm256_extractf128_ps(r14, 1));
        _mm_storeu_ps(dst + 20, _mm256_extractf128_ps(r25, 1));
    }

    // scalar tail: fewer than 8 triplets left
    for (; t < triplets; ++t, dst += 3)
    {
        dst[0] = pBlu[t];
        dst[1] = pGrn[t];
        dst[2] = pRed[t];
    }
}
// Regroups the two 8-float rows a = v1[0] and b = v1[1] into
//   vout[0] = { a0, a4, b0, b4, a1, a5, b1, b5 }
//   vout[1] = { a2, a6, b2, b6, a3, a7, b3, b7 }
// (layout of the enabled AVX2 path). The AVX1 variant is compiled out and
// kept only for reference.
static void NOINLINE transposeX8( const __m256 *v1, __m256 *vout )
{
#if 0 // AVX1
    __m256 lo  = _mm256_unpacklo_ps( v1[ 0 ], v1[ 1 ] );
    __m256 hi  = _mm256_unpackhi_ps( v1[ 0 ], v1[ 1 ] );
    __m256 p0  = _mm256_permute2f128_ps( lo, hi, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    __m256 p1  = _mm256_permute2f128_ps( lo, hi, _MM_SHUFFLE( 0, 3, 0, 1 ) );
    __m256 q0  = _mm256_unpacklo_ps( p0, p1 );
    __m256 q1  = _mm256_unpackhi_ps( p0, p1 );
    vout[ 0 ] = _mm256_permute2f128_ps( q0, q1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    vout[ 1 ] = _mm256_permute2f128_ps( q0, q1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
#else // AVX2
    // Cross-lane permutation indices, element 0 first.
    const __m256i idxRow0 = _mm256_setr_epi32( 0, 4, 2, 6, 1, 5, 3, 7 );
    const __m256i idxRow1 = _mm256_setr_epi32( 2, 6, 0, 4, 3, 7, 1, 5 );

    // Both inputs are read before either output is written.
    const __m256 row0 = _mm256_permutevar8x32_ps( v1[ 0 ], idxRow0 );
    const __m256 row1 = _mm256_permutevar8x32_ps( v1[ 1 ], idxRow1 );

    // 0xCC: elements 2,3,6,7 come from row1, the rest from row0.
    vout[ 0 ] = _mm256_blend_ps( row0, row1, 0xCC );
    // _MM_SHUFFLE( 1, 0, 3, 2 ) == 0x4E: per half { row0[2], row0[3], row1[0], row1[1] }.
    vout[ 1 ] = _mm256_shuffle_ps( row0, row1, _MM_SHUFFLE( 1, 0, 3, 2 ) );
#endif
}
Beispiel #6
0
/* Runs _mm256_shuffle_ps on two fixed vectors with the immediate MASK,
   builds the expected result element by element with select4 (), and
   aborts when check_union256 () returns nonzero.  */
void static
avx_test (void)
{
  union256 u, s1, s2;
  float e [8];
  int k;

  s1.x = _mm256_set_ps (1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8);
  s2.x = _mm256_set_ps (2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8);
  u.x = _mm256_shuffle_ps (s1.x, s2.x, MASK);

  /* Expected element k:
       - positions 0-1 of each group of four come from s1, 2-3 from s2;
       - elements 4..7 select within source elements 4..7;
       - position (k & 3) uses the 2-bit field of MASK at bit 2 * (k & 3).  */
  for (k = 0; k < 8; k++)
    {
      float *src = (k & 2) ? s2.a : s1.a;
      e[k] = select4 (src + (k & 4), (MASK >> (2 * (k & 3))) & 0x3);
    }

  if (check_union256 (u, e))
    abort ();
}
Beispiel #7
0
// Shuffles two avxb values with a compile-time selector: in every 128-bit
// half, result elements 0 and 1 are a[i0] and a[i1], elements 2 and 3 are
// b[i2] and b[i3]. i0..i3 are not declared in this block (presumably
// enclosing template parameters).
INLINE avxb shuffle(const avxb& a, const avxb& b) {
  enum { selector = _MM_SHUFFLE(i3, i2, i1, i0) };
  return _mm256_shuffle_ps(a, b, selector);
}
Beispiel #8
0
 // Integer flavour of the two-source shuffle: the operands are reinterpreted
 // as packed floats, shuffled with the compile-time selector, and the result
 // is reinterpreted back to integers. In every 128-bit half, result elements
 // 0 and 1 are a[index_0], a[index_1]; elements 2 and 3 are b[index_2], b[index_3].
 template<index_t index_0, index_t index_1, index_t index_2, index_t index_3> INLINE const avxi shuffle( const avxi& a, const avxi& b ) {
   const __m256 a_ps  = _mm256_castsi256_ps(a);
   const __m256 b_ps  = _mm256_castsi256_ps(b);
   const __m256 mixed = _mm256_shuffle_ps(a_ps, b_ps, _MM_SHUFFLE(index_3, index_2, index_1, index_0));
   return _mm256_castps_si256(mixed);
 }
void animate()
{
	float mx;
	float my;
	if(ManualControl)
	{
		POINT pos;
		GetCursorPos(&pos);
		RECT rc;
		GetClientRect(hMainWnd, &rc);
		ScreenToClient(hMainWnd, &pos);

		mx = pos.x;
		my = pos.y;
	}
	else
	{
		UpdatePosition(mx, my);
	}


	const auto size = partCount;

	VertexData *pVertexBuffer;
	pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);

	_mm256_zeroall();

#pragma omp parallel \
	shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
	{
#pragma omp for nowait
		for(int i = 0; i < size; i += 4)
		{
			float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };

			float *particleCoordsVec = (float*)particlesCoord + i;
			float *velocityVec = (float*)particlesVel + i;

			auto xyCoord = _mm256_loadu_ps(particleCoordsVec);
			auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));

			auto squares = _mm256_mul_ps(hwTempData, hwTempData);
			auto distSquare = _mm256_hadd_ps(squares, squares);
			distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);

			auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);

			if(distSquare.m256_f32[0] < 400)
			{
				theForce.m256_f32[0] = 0;
				theForce.m256_f32[1] = 0;
			}

			if(distSquare.m256_f32[2] < 400)
			{
				theForce.m256_f32[2] = 0;
				theForce.m256_f32[3] = 0;
			}
			if(distSquare.m256_f32[4] < 400)
			{
				theForce.m256_f32[4] = 0;
				theForce.m256_f32[5] = 0;
			}

			if(distSquare.m256_f32[6] < 400)
			{
				theForce.m256_f32[6] = 0;
				theForce.m256_f32[7] = 0;
			}

			auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);

			auto xyVelocities = _mm256_loadu_ps(velocityVec);
			xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
			xyVelocities = _mm256_add_ps(xyVelocities, xyForces);

			xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

			_mm256_storeu_ps(velocityVec, xyVelocities);
			_mm256_storeu_ps(particleCoordsVec, xyCoord);


			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

			pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x;
			pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y;
			pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
			pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
			pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
			pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
			pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
			pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
		}
	}
	pVertexObject->Unlock();

	_mm256_zeroall();
}
// Shuffle with immediate 203 == _MM_SHUFFLE(3, 0, 2, 3): per 128-bit half the
// result is { a[3], a[2], b[0], b[3] }.
__m256 x(__m256 a, __m256 b) {
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
  return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 0, 2, 3));
}