// Interleaves two 8-float rows. With A = v1[0] and B = v1[1] the result is
//   vout[0] = { A0, A4, B0, B4, A1, A5, B1, B5 }
//   vout[1] = { A2, A6, B2, B6, A3, A7, B3, B7 }
// The disabled AVX1 variant computes the same layout with unpack/permute2f128 only.
static void NOINLINE transposeX8( const __m256 *v1, __m256 *vout )
{
#if 0 // AVX1
    // Interleave A and B element-wise inside each 128-bit half.
    __m256 lo = _mm256_unpacklo_ps( v1[ 0 ], v1[ 1 ] );
    __m256 hi = _mm256_unpackhi_ps( v1[ 0 ], v1[ 1 ] );

    // Regroup the halves so that elements 0..3 and 4..7 end up in separate registers.
    __m256 firstFour = _mm256_permute2f128_ps( lo, hi, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    __m256 lastFour  = _mm256_permute2f128_ps( lo, hi, _MM_SHUFFLE( 0, 3, 0, 1 ) );

    __m256 mixedLo = _mm256_unpacklo_ps( firstFour, lastFour );
    __m256 mixedHi = _mm256_unpackhi_ps( firstFour, lastFour );

    vout[ 0 ] = _mm256_permute2f128_ps( mixedLo, mixedHi, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    vout[ 1 ] = _mm256_permute2f128_ps( mixedLo, mixedHi, _MM_SHUFFLE( 0, 3, 0, 1 ) );
#else // AVX2
    // Cross-lane permutation indices for the two input rows.
    static const int ALIGN32 rowAOrder[ 8 ] = { 0, 4, 2, 6, 1, 5, 3, 7 };
    static const int ALIGN32 rowBOrder[ 8 ] = { 2, 6, 0, 4, 3, 7, 1, 5 };

    const __m256i idxA = _mm256_load_si256( reinterpret_cast< const __m256i* >( rowAOrder ) );
    const __m256i idxB = _mm256_load_si256( reinterpret_cast< const __m256i* >( rowBOrder ) );

    // rowA = { A0, A4, A2, A6, A1, A5, A3, A7 }
    // rowB = { B2, B6, B0, B4, B3, B7, B1, B5 }
    const __m256 rowA = _mm256_permutevar8x32_ps( v1[ 0 ], idxA );
    const __m256 rowB = _mm256_permutevar8x32_ps( v1[ 1 ], idxB );

    // 0xCC takes elements 2, 3, 6, 7 from rowB and the rest from rowA.
    vout[ 0 ] = _mm256_blend_ps( rowA, rowB, 0xCC );
    // 0x4E picks, per 128-bit half, elements { 2, 3 } of rowA followed by { 0, 1 } of rowB.
    vout[ 1 ] = _mm256_shuffle_ps( rowA, rowB, 0x4E );
#endif
}
// Expands the first four lanes of the current SIMD vertex vector for `slot` so that
// every one of those lanes appears twice in a row, and writes that same expanded
// vector to all three outputs tri[0], tri[1] and tri[2].
// Afterwards it registers PaPoints1 / PaPointsSingle0 through SetNextPaState, bumps
// pa.numPrimsComplete by KNOB_VS_SIMD_WIDTH and always returns true.
bool PaPoints0(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &src = PaGetSimdVector(pa, pa.cur, slot);

    UINT comp = 0;
    while (comp < 4)
    {
        // Duplicate each element inside its own 128-bit half.
        const __m256 dupLow  = _mm256_unpacklo_ps(src.v[comp], src.v[comp]); // 0 0 1 1 4 4 5 5
        const __m256 dupHigh = _mm256_unpackhi_ps(src.v[comp], src.v[comp]); // 2 2 3 3 6 6 7 7

        // Keep only the lower half of each: 0 0 1 1 2 2 3 3
        const __m256 spread = _mm256_permute2f128_ps(dupLow, dupHigh, 0x20);

        tri[2].v[comp] = spread;
        tri[1].v[comp] = spread;
        tri[0].v[comp] = spread;

        ++comp;
    }

    SetNextPaState(pa, PaPoints1, PaPointsSingle0, 1);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
// One layer of the sorting network used by nibble_sort_beekman1.
//
// Expects r0, r1, r2 (__m256i) and shuf<n> to be in scope. r0 / r1 each hold one
// nibble per byte; 64-bit element k of both registers belongs to input word k.
//   1. byte-wise min/max performs the compare-exchange between r0 and r1;
//   2. the first pair of _mm256_shuffle_pd gathers min and max of words 0/2 into r1
//      and of words 1/3 into r2, so all 16 nibbles of a word share one 128-bit lane;
//   3. _mm256_shuffle_epi8 permutes those 16 bytes (it only uses the low 4 index bits
//      within a lane) to line up the partners of the next layer;
//   4. the second pair of _mm256_shuffle_pd restores the "element k = word k" layout.
//
// The macro is intentionally left defined after the function, exactly as before,
// in case other code in this file relies on it.
#define sort_and_shuffle(n)                                                          \
  r2 = _mm256_max_epi8(r0, r1);                                                      \
  r0 = _mm256_min_epi8(r0, r1);                                                      \
  r1 = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(r0),                \
                                             _mm256_castsi256_pd(r2), 0x0));         \
  r2 = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(r0),                \
                                             _mm256_castsi256_pd(r2), 0xF));         \
  r1 = _mm256_shuffle_epi8(r1, shuf##n);                                             \
  r2 = _mm256_shuffle_epi8(r2, shuf##n);                                             \
  r0 = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(r1),                \
                                             _mm256_castsi256_pd(r2), 0x0));         \
  r1 = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(r1),                \
                                             _mm256_castsi256_pd(r2), 0xF))

// Sorts, in place, the sixteen 4-bit nibbles inside each of the 1024 64-bit words
// that `buf` points to, four words per AVX2 register, using a nine-layer
// compare-exchange network.
//
// buf: 1024 words (8 KiB); no particular alignment is required because unaligned
//      loads and stores are used.
//
// Only intrinsics are used (no GNU vector-extension operators, casts or brace
// initialisers), so the code does not depend on a particular compiler's __m256i
// representation.
void nibble_sort_beekman1(uint64_t *buf) {
  // Byte permutations applied after each layer (64-bit elements, lowest first).
  // Layer "0" needs no permutation, the data is already in the right order:
  //   shuf0 = {0x1716151413121110, 0x1f1e1d1c1b1a1918,
  //            0x0706050403020100, 0x0f0e0d0c0b0a0908}
  const __m256i shuf1 = _mm256_setr_epi64x(0x1e161c141a121810LL, 0x1f171d151b131911LL,
                                           0x0e060c040a020800LL, 0x0f070d050b030901LL);
  const __m256i shuf2 = _mm256_setr_epi64x(0x1d1c151419181110LL, 0x1f1e17161b1a1312LL,
                                           0x0d0c050409080100LL, 0x0f0e07060b0a0302LL);
  // Layer 3 is done with cheaper unpack instructions below instead of:
  //   shuf3 = {0x1b1a191813121110, 0x1f1e1d1c17161514,
  //            0x0b0a090803020100, 0x0f0e0d0c07060504}
  const __m256i shuf4 = _mm256_setr_epi64x(0x101d171615141311LL, 0x1f1e1b191a181c12LL,
                                           0x000d070605040301LL, 0x0f0e0b090a080c02LL);
  const __m256i shuf5 = _mm256_setr_epi64x(0x171d151413111810LL, 0x1f1e16191c1b1a12LL,
                                           0x070d050403010800LL, 0x0f0e06090c0b0a02LL);
  const __m256i shuf6 = _mm256_setr_epi64x(0x1e17161a15141211LL, 0x1f101d1c1b191318LL,
                                           0x0e07060a05040201LL, 0x0f000d0c0b090308LL);
  const __m256i shuf7 = _mm256_setr_epi64x(0x171510161b131911LL, 0x1f1d181e1c141a12LL,
                                           0x070500060b030901LL, 0x0f0d080e0c040a02LL);
  const __m256i shuf8 = _mm256_setr_epi64x(0x1715141613121110LL, 0x1f1e1c1b1a19181dLL,
                                           0x0705040603020100LL, 0x0f0e0c0b0a09080dLL);
  const __m256i shuf9 = _mm256_setr_epi64x(0x171c1b1a19181615LL, 0x1f1e14131211101dLL,
                                           0x070c0b0a09080605LL, 0x0f0e04030201000dLL);

  const __m256i nibblemask = _mm256_set1_epi8(0x0f);
  __m256i *const vec = reinterpret_cast<__m256i *>(buf);

  for (uint32_t i = 0; i < (1024 / 4); i += 1) {
    const __m256i packed = _mm256_loadu_si256(vec + i);

    // r0: low nibble of every byte. r1: high nibble of every byte moved down to
    // bits 0..3 (the low nibbles are cleared first, so the 64-bit shift cannot
    // leak bits across byte boundaries).
    __m256i r0 = _mm256_and_si256(packed, nibblemask);
    __m256i r1 = _mm256_srli_epi64(_mm256_xor_si256(packed, r0), 4);
    __m256i r2;

    sort_and_shuffle(1);
    sort_and_shuffle(2);
    {
      // Layer 3: same compare-exchange, but its permutation is expressed with
      // 32-bit unpacks followed by a 64-bit shuffle instead of a byte shuffle.
      r2 = _mm256_max_epi8(r0, r1);
      r0 = _mm256_min_epi8(r0, r1);
      r1 = _mm256_castps_si256(
          _mm256_unpacklo_ps(_mm256_castsi256_ps(r0), _mm256_castsi256_ps(r2)));
      r2 = _mm256_castps_si256(
          _mm256_unpackhi_ps(_mm256_castsi256_ps(r0), _mm256_castsi256_ps(r2)));
      r0 = _mm256_castpd_si256(
          _mm256_shuffle_pd(_mm256_castsi256_pd(r1), _mm256_castsi256_pd(r2), 0xF));
      r1 = _mm256_castpd_si256(
          _mm256_shuffle_pd(_mm256_castsi256_pd(r1), _mm256_castsi256_pd(r2), 0x0));
    }
    sort_and_shuffle(4);
    sort_and_shuffle(5);
    sort_and_shuffle(6);
    sort_and_shuffle(7);
    sort_and_shuffle(8);
    sort_and_shuffle(9);

    // Re-pack: r1 supplies the high nibble of every byte, r0 the low nibble.
    r1 = _mm256_slli_epi64(r1, 4);
    _mm256_storeu_si256(vec + i, _mm256_or_si256(r1, r0));
  }
}
// Integer flavour of _mm256_unpackhi_ps: within each 128-bit half the result is
// { a2, b2, a3, b3 } (32-bit elements). The casts only reinterpret the bits so that
// the float interleave instruction can be used on integer data.
INLINE avxi unpackhi( const avxi& a, const avxi& b )
{
  const __m256 lhsBits     = _mm256_castsi256_ps(a);
  const __m256 rhsBits     = _mm256_castsi256_ps(b);
  const __m256 interleaved = _mm256_unpackhi_ps(lhsBits, rhsBits);
  return _mm256_castps_si256(interleaved);
}
void PaTriStripSingle0(PA_STATE &pa, UINT slot, UINT triIndex, __m128 triverts[3]) { simdvector &a = PaGetSimdVector(pa, pa.prev, slot); simdvector &b = PaGetSimdVector(pa, pa.cur, slot); simdscalar tmp0; simdscalar tmp1; // Convert from vertical to horizontal. switch (triIndex) { case 0: // Grab vertex 0 from lane 0 and store it in tri[0] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 1 and store it in tri[1] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 2 from lane 2 and store it in tri[2] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); break; case 1: // Grab vertex 2 from lane 2 and store it in tri[2] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 1 and store it in tri[1] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 0 from lane 3 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); break; case 2: // Grab vertex 2 from lane 2 and store it in tri[2] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 0 from lane 3 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 4 from 'a' and store it in tri[1] tmp0 = 
_mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); break; case 3: // Grab vertex 1 from lane 4 from 'a' and store it in tri[1] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 0 from lane 3 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 2 from lane 5 from 'a' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); break; case 4: // Grab vertex 1 from lane 4 from 'a' and store it in tri[1] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 2 from lane 5 from 'a' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 0 from lane 6 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); break; case 5: // Grab vertex 0 from lane 6 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 2 from lane 5 from 'a' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 1 from lane 7 from 'a' and store it in tri[1] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[2] = 
_mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); break; case 6: // Grab vertex 0 from lane 6 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 1 from lane 7 from 'a' and store it in tri[1] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 2 from lane 0 from 'b' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(b.x, b.z); tmp1 = _mm256_unpacklo_ps(b.y, b.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); break; case 7: // Grab vertex 2 from lane 0 from 'b' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(b.x, b.z); tmp1 = _mm256_unpacklo_ps(b.y, b.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 7 from 'a' and store it in tri[1] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 0 from lane 1 from 'b' and store it in tri[0] tmp0 = _mm256_unpacklo_ps(b.x, b.z); tmp1 = _mm256_unpacklo_ps(b.y, b.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); break; }; }
// Extracts SIMD lane 7 of a vertically laid-out vector and returns its components
// as a horizontal { x7, y7, z7, w7 }.
INLINE __m128 swizzleLane7(simdvector &a)
{
    // Upper 128-bit half after these two steps: { x6, z6, x7, z7 } and { y6, w6, y7, w7 }.
    const simdscalar xzPairs = _mm256_unpackhi_ps(a.x, a.z);
    const simdscalar ywPairs = _mm256_unpackhi_ps(a.y, a.w);

    // Upper half now holds { x7, y7, z7, w7 }.
    const simdscalar merged = _mm256_unpackhi_ps(xzPairs, ywPairs);

    return _mm256_extractf128_ps(merged, 1);
}
void PaTriListSingle0(PA_STATE &pa, UINT slot, UINT triIndex, __m128 triverts[3]) { // We have 12 simdscalars contained within 3 simdvectors which // hold at least 8 triangles worth of data. We want to assemble a single // triangle with data in horizontal form. simdvector &a = PaGetSimdVector(pa, 0, slot); simdvector &b = PaGetSimdVector(pa, 1, slot); simdvector &c = PaGetSimdVector(pa, 2, slot); simdscalar tmp0; simdscalar tmp1; // Convert from vertical to horizontal. switch (triIndex) { case 0: // Grab vertex 0 from lane 0 and store it in tri[0] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 1 and store it in tri[1] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 2 from lane 2 and store it in tri[2] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); break; case 1: // Grab vertex 0 from lane 3 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 4 from 'a' and store it in tri[1] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 2 from lane 5 from 'a' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(a.x, a.z); tmp1 = _mm256_unpacklo_ps(a.y, a.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); break; case 2: // Grab vertex 0 from lane 6 from 'a' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 1 from lane 7 from 'a' and 
store it in tri[1] tmp0 = _mm256_unpackhi_ps(a.x, a.z); tmp1 = _mm256_unpackhi_ps(a.y, a.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 2 from lane 0 from 'b' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(b.x, b.z); tmp1 = _mm256_unpacklo_ps(b.y, b.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); break; case 3: // Grab vertex 0 from lane 1 from 'b' and store it in tri[0] tmp0 = _mm256_unpacklo_ps(b.x, b.z); tmp1 = _mm256_unpacklo_ps(b.y, b.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 2 from 'b' and store it in tri[1] tmp0 = _mm256_unpackhi_ps(b.x, b.z); tmp1 = _mm256_unpackhi_ps(b.y, b.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 2 from lane 3 from 'b' and store it in tri[2] tmp0 = _mm256_unpackhi_ps(b.x, b.z); tmp1 = _mm256_unpackhi_ps(b.y, b.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); break; case 4: // Grab vertex 0 from lane 4 from 'b' and store it in tri[0] tmp0 = _mm256_unpacklo_ps(b.x, b.z); tmp1 = _mm256_unpacklo_ps(b.y, b.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 1 from lane 5 from 'b' and store it in tri[1] tmp0 = _mm256_unpacklo_ps(b.x, b.z); tmp1 = _mm256_unpacklo_ps(b.y, b.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 2 from lane 6 from 'b' and store it in tri[2] tmp0 = _mm256_unpackhi_ps(b.x, b.z); tmp1 = _mm256_unpackhi_ps(b.y, b.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); break; case 5: // Grab vertex 0 from lane 7 from 'b' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(b.x, b.z); tmp1 = _mm256_unpackhi_ps(b.y, b.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 1 from lane 0 from 'c' and store it in tri[1] tmp0 = _mm256_unpacklo_ps(c.x, c.z); tmp1 = _mm256_unpacklo_ps(c.y, 
c.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 2 from lane 1 from 'c' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(c.x, c.z); tmp1 = _mm256_unpacklo_ps(c.y, c.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); break; case 6: // Grab vertex 0 from lane 2 from 'c' and store it in tri[0] tmp0 = _mm256_unpackhi_ps(c.x, c.z); tmp1 = _mm256_unpackhi_ps(c.y, c.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); // Grab vertex 1 from lane 3 from 'c' and store it in tri[1] tmp0 = _mm256_unpackhi_ps(c.x, c.z); tmp1 = _mm256_unpackhi_ps(c.y, c.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); // Grab vertex 2 from lane 4 from 'c' and store it in tri[2] tmp0 = _mm256_unpacklo_ps(c.x, c.z); tmp1 = _mm256_unpacklo_ps(c.y, c.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); break; case 7: // Grab vertex 0 from lane 5 from 'c' and store it in tri[0] tmp0 = _mm256_unpacklo_ps(c.x, c.z); tmp1 = _mm256_unpacklo_ps(c.y, c.w); triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); // Grab vertex 1 from lane 6 from 'c' and store it in tri[1] tmp0 = _mm256_unpackhi_ps(c.x, c.z); tmp1 = _mm256_unpackhi_ps(c.y, c.w); triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); // Grab vertex 2 from lane 7 from 'c' and store it in tri[2] tmp0 = _mm256_unpackhi_ps(c.x, c.z); tmp1 = _mm256_unpackhi_ps(c.y, c.w); triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); break; }; }