// Transposes a row-major 4x4 float matrix stored across two 256-bit registers.
//
// Treating v1[ 0 ] and v1[ 1 ] as sixteen consecutive floats e0..e15, this writes
//   vout[ 0 ] = { e0, e4, e8,  e12, e1, e5, e9,  e13 }
//   vout[ 1 ] = { e2, e6, e10, e14, e3, e7, e11, e15 }
// Both input registers are read before either output register is stored.
static void NOINLINE transposeX8( const __m256 *v1, __m256 *vout )
{
#if 0 // AVX1
    // Interleave the two rows, regroup 128-bit halves, interleave again.
    const __m256 mixLo = _mm256_unpacklo_ps( v1[ 0 ], v1[ 1 ] );
    const __m256 mixHi = _mm256_unpackhi_ps( v1[ 0 ], v1[ 1 ] );
    const __m256 lowHalves  = _mm256_permute2f128_ps( mixLo, mixHi, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    const __m256 highHalves = _mm256_permute2f128_ps( mixLo, mixHi, _MM_SHUFFLE( 0, 3, 0, 1 ) );
    const __m256 evenCols = _mm256_unpacklo_ps( lowHalves, highHalves );
    const __m256 oddCols  = _mm256_unpackhi_ps( lowHalves, highHalves );
    vout[ 0 ] = _mm256_permute2f128_ps( evenCols, oddCols, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    vout[ 1 ] = _mm256_permute2f128_ps( evenCols, oddCols, _MM_SHUFFLE( 0, 3, 0, 1 ) );
#else // AVX2
    // Cross-lane gather orders, one per input register. They are chosen so
    // that a single blend and a single in-lane shuffle finish the transpose.
    static const int ALIGN32 orderFirst[ 8 ]  = { 0, 4, 2, 6, 1, 5, 3, 7 };
    static const int ALIGN32 orderSecond[ 8 ] = { 2, 6, 0, 4, 3, 7, 1, 5 };

    const __m256i idxFirst  = _mm256_load_si256( reinterpret_cast< const __m256i* >( orderFirst ) );
    const __m256i idxSecond = _mm256_load_si256( reinterpret_cast< const __m256i* >( orderSecond ) );

    // first  = e0  e4  e2 e6  | e1  e5  e3 e7
    // second = e10 e14 e8 e12 | e11 e15 e9 e13
    const __m256 first  = _mm256_permutevar8x32_ps( v1[ 0 ], idxFirst );
    const __m256 second = _mm256_permutevar8x32_ps( v1[ 1 ], idxSecond );

    // 0xCC: elements 2,3,6,7 come from 'second', the rest from 'first'.
    vout[ 0 ] = _mm256_blend_ps( first, second, 0xCC );
    // 0x4E: per 128-bit lane, take first[2], first[3], second[0], second[1].
    vout[ 1 ] = _mm256_shuffle_ps( first, second, 0x4E );
#endif
}
Example #2
0
// Expands the first four lanes of the current simdvector for 'slot' so that
// each of them appears twice in a row, and writes that same result to all
// three entries of 'tri'. Afterwards it hands PaPoints1 / PaPointsSingle0 to
// SetNextPaState and advances pa.numPrimsComplete by KNOB_VS_SIMD_WIDTH.
// Always returns true.
bool PaPoints0(PA_STATE &pa, UINT slot, simdvector tri[3])
{
    simdvector &src = PaGetSimdVector(pa, pa.cur, slot);

    for (UINT comp = 0; comp < 4; ++comp)
    {
        // Interleaving a register with itself duplicates each element in place.
        const __m256 dupLow  = _mm256_unpacklo_ps(src.v[comp], src.v[comp]);   // 0 0 1 1 | 4 4 5 5
        const __m256 dupHigh = _mm256_unpackhi_ps(src.v[comp], src.v[comp]);   // 2 2 3 3 | 6 6 7 7

        // 0x20 concatenates the lower 128 bits of both operands: 0 0 1 1 2 2 3 3
        const __m256 firstFour = _mm256_permute2f128_ps(dupLow, dupHigh, 0x20);

        tri[2].v[comp] = firstFour;
        tri[1].v[comp] = firstFour;
        tri[0].v[comp] = firstFour;
    }

    SetNextPaState(pa, PaPoints1, PaPointsSingle0, 1);
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;

    return true;
}
Example #3
0
// Rewrites, in place, the 1024 64-bit words starting at `buf` (the loop below
// touches 256 * 32 bytes, so `buf` must provide at least that much storage).
// Each iteration handles four words: every byte is split into its low and high
// 4-bit nibble, the nibbles are run through nine min/max + permute stages, and
// the two halves are packed back into bytes and stored to the same address.
//
// NOTE(review): judging by the name and the compare-exchange stages this is a
// sorting network over the 16 nibbles of each word; the final ordering depends
// on the shuffle tables and was not verified here.
//
// The brace-initialised __m256i values, the &=, ^=, | operators on vectors,
// the vector-to-vector casts and the 0b... literals all rely on compiler
// vector-extension support rather than on plain intrinsics.
void nibble_sort_beekman1(uint64_t *buf) {
  // Byte-permutation tables for _mm256_shuffle_epi8, one per network stage.
  // Each initialiser lists four 64-bit values, i.e. two per 128-bit lane.
  // NOTE(review): _mm256_shuffle_epi8 indexes within a 128-bit lane using only
  // the low four bits of each index byte, so the 0x1_ indices in the first two
  // values should select the same positions as the 0x0_ ones — confirm.
  // already in the right order
  //__m256i
  // shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL};
  __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL,
                   0x0e060c040a020800ULL, 0x0f070d050b030901ULL};
  __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL,
                   0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL};
  // Stage 3 has no table: it is hand-expanded inside the loop with 32-bit
  // interleaves instead of a byte shuffle.
  // use less instructions below
  //__m256i
  // shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL};
  __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL,
                   0x000d070605040301ULL, 0x0f0e0b090a080c02ULL};
  __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL,
                   0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL};
  __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL,
                   0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL};
  __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL,
                   0x070500060b030901ULL, 0x0f0d080e0c040a02ULL};
  __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL,
                   0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL};
  __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL,
                   0x070c0b0a09080605ULL, 0x0f0e04030201000dULL};
  // 0x0f in every byte: selects the low nibble of each byte.
  __m256i nibblemask = _mm256_set1_epi8(0x0f);
  // 1024 words / 4 words per 256-bit register = 256 iterations.
  for (uint32_t i = 0; i < (1024 / 4); i += 1) {
    // Unaligned load of four words. r0 keeps the low nibble of every byte;
    // r1 keeps what is left (the high nibbles) and is shifted down so that
    // both registers hold one value in the range 0..15 per byte.
    __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2;
    r0 &= nibblemask;
    r1 ^= r0;
    r1 = _mm256_srli_epi64(r1, 4);

// One network stage. It expects r0/r1 to hold two groups of 8 values per word
// and uses r2 as scratch:
//   1. bytewise max -> r2, bytewise min -> r0 (compare-exchange of r0 vs r1);
//   2. the shuffle_pd pair regroups 64-bit elements so that each 128-bit lane
//      holds the 8 minima followed by the 8 maxima of a single word
//      (r1: words 0 and 2, r2: words 1 and 3);
//   3. shuf##n permutes those 16 bytes inside each lane;
//   4. the second shuffle_pd pair splits the lanes back: r0 receives the low
//      8 bytes of every word, r1 the high 8 bytes.
// The macro is defined inside the loop body and is never #undef'd, so it
// stays visible for the rest of the translation unit.
#define sort_and_shuffle(n)                                                    \
  r2 = _mm256_max_epi8(r0, r1);                                                \
  r0 = _mm256_min_epi8(r0, r1);                                                \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000);           \
  r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111);           \
  r1 = _mm256_shuffle_epi8(r1, shuf##n);                                       \
  r2 = _mm256_shuffle_epi8(r2, shuf##n);                                       \
  r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);           \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111)

    sort_and_shuffle(1);
    sort_and_shuffle(2);
    // Stage 3, written out by hand: same min/max step, but the regrouping is
    // done with 32-bit unpacklo/unpackhi interleaves of min and max followed
    // by one shuffle_pd pair. Here r0 takes the high 64-bit elements (0b1111)
    // and r1 the low ones (0b0000), the opposite of the macro.
    { // sort_and_shuffle(3);
      r2 = _mm256_max_epi8(r0, r1);
      r0 = _mm256_min_epi8(r0, r1);
      r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2);
      r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2);
      r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111);
      r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);
    }
    sort_and_shuffle(4);
    sort_and_shuffle(5);
    sort_and_shuffle(6);
    sort_and_shuffle(7);
    sort_and_shuffle(8);
    sort_and_shuffle(9);

    // Re-pack: r1's values go back into the high nibble of each byte, r0's
    // stay in the low nibble, and the result overwrites the source words.
    r1 = _mm256_slli_epi64(r1, 4);
    _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0);
  }
}
Example #4
0
 // Interleaves the upper two 32-bit elements of each 128-bit lane of a and b:
 // the result is { a2, b2, a3, b3 | a6, b6, a7, b7 }. The work is done by the
 // float unpack instruction; the casts only reinterpret the bits.
 INLINE avxi unpackhi( const avxi& a, const avxi& b )
 {
   const __m256 lhs = _mm256_castsi256_ps(a);
   const __m256 rhs = _mm256_castsi256_ps(b);
   const __m256 interleaved = _mm256_unpackhi_ps(lhs, rhs);
   return _mm256_castps_si256(interleaved);
 }
Example #5
0
void PaTriStripSingle0(PA_STATE &pa, UINT slot, UINT triIndex, __m128 triverts[3])
{
    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
    simdscalar tmp0;
    simdscalar tmp1;

    // Convert from vertical to horizontal.
    switch (triIndex)
    {
    case 0:
        // Grab vertex 0 from lane 0 and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 1 and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 2 and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        break;
    case 1:
        // Grab vertex 2 from lane 2 and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 1 and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 0 from lane 3 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
        break;
    case 2:
        // Grab vertex 2 from lane 2 and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 0 from lane 3 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 4 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
        break;
    case 3:
        // Grab vertex 1 from lane 4 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 0 from lane 3 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
        break;

    case 4:
        // Grab vertex 1 from lane 4 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 0 from lane 6 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
        break;
    case 5:
        // Grab vertex 0 from lane 6 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        break;
    case 6:
        // Grab vertex 0 from lane 6 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 0 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
        break;
    case 7:
        // Grab vertex 2 from lane 0 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 0 from lane 1 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
        break;
    };
}
Example #6
0
// Returns SIMD lane 7 of 'a' as one packed (x, y, z, w) vector.
INLINE __m128 swizzleLane7(simdvector &a)
{
    // unpackhi on two components interleaves their lanes 2,3 in the lower
    // 128 bits and their lanes 6,7 in the upper 128 bits.
    simdscalar xzUpper = _mm256_unpackhi_ps(a.x, a.z);   // x2 z2 x3 z3 | x6 z6 x7 z7
    simdscalar ywUpper = _mm256_unpackhi_ps(a.y, a.w);   // y2 w2 y3 w3 | y6 w6 y7 w7

    // A second unpackhi keeps the odd lane of each pair.
    simdscalar oddLanes = _mm256_unpackhi_ps(xzUpper, ywUpper);   // x3 y3 z3 w3 | x7 y7 z7 w7

    // The upper 128 bits hold lane 7.
    return _mm256_extractf128_ps(oddLanes, 1);
}
Example #7
0
void PaTriListSingle0(PA_STATE &pa, UINT slot, UINT triIndex, __m128 triverts[3])
{
    // We have 12 simdscalars contained within 3 simdvectors which
    // hold at least 8 triangles worth of data. We want to assemble a single
    // triangle with data in horizontal form.
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);
    simdvector &c = PaGetSimdVector(pa, 2, slot);
    simdscalar tmp0;
    simdscalar tmp1;

    // Convert from vertical to horizontal.
    switch (triIndex)
    {
    case 0:
        // Grab vertex 0 from lane 0 and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 1 and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 2 and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        break;
    case 1:
        // Grab vertex 0 from lane 3 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 4 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 5 from 'a' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(a.x, a.z);
        tmp1 = _mm256_unpacklo_ps(a.y, a.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        break;
    case 2:
        // Grab vertex 0 from lane 6 from 'a' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 7 from 'a' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(a.x, a.z);
        tmp1 = _mm256_unpackhi_ps(a.y, a.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 0 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        break;
    case 3:
        // Grab vertex 0 from lane 1 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 2 from 'b' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 3 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        break;

    case 4:
        // Grab vertex 0 from lane 4 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 5 from 'b' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(b.x, b.z);
        tmp1 = _mm256_unpacklo_ps(b.y, b.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 6 from 'b' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        break;
    case 5:
        // Grab vertex 0 from lane 7 from 'b' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(b.x, b.z);
        tmp1 = _mm256_unpackhi_ps(b.y, b.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 0 from 'c' and store it in tri[1]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 1 from 'c' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
        break;
    case 6:
        // Grab vertex 0 from lane 2 from 'c' and store it in tri[0]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);

        // Grab vertex 1 from lane 3 from 'c' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);

        // Grab vertex 2 from lane 4 from 'c' and store it in tri[2]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        break;
    case 7:
        // Grab vertex 0 from lane 5 from 'c' and store it in tri[0]
        tmp0 = _mm256_unpacklo_ps(c.x, c.z);
        tmp1 = _mm256_unpacklo_ps(c.y, c.w);
        triverts[0] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);

        // Grab vertex 1 from lane 6 from 'c' and store it in tri[1]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[1] = _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);

        // Grab vertex 2 from lane 7 from 'c' and store it in tri[2]
        tmp0 = _mm256_unpackhi_ps(c.x, c.z);
        tmp1 = _mm256_unpackhi_ps(c.y, c.w);
        triverts[2] = _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
        break;
    };
}