void PaLineStrip0Common(PA_STATE &pa, UINT slot, simdvector tri[3]) { simdvector &a = PaGetSimdVector(pa, pa.cur, slot); for (int i = 0; i < 4; ++i) { simdscalar a0 = a[i]; // index 0 simdvector &v0 = tri[0]; // 01234 -> 01122334 __m128 vLow = _mm256_extractf128_ps(a0, 0); __m128 vHigh = _mm256_extractf128_ps(a0, 1); // 0123 -> 0112 // 0123 -> 233x __m128 vOutLow = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(2, 1, 1, 0)); __m128 vOutHigh = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(3, 3, 3, 2)); float f; _MM_EXTRACT_FLOAT(f, vHigh, 0); vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xf0); v0[i] = _mm256_insertf128_ps(v0[i], vOutLow, 0); v0[i] = _mm256_insertf128_ps(v0[i], vOutHigh, 1); // index 1 is same as index 0, but position needs to be adjusted to bloat the line // into 2 tris 1 pixel wide simdvector &v1 = tri[1]; v1[i] = v0[i]; // index 2 // 01234 -> 10213243 simdvector &v2 = tri[2]; // 0123 -> 1021 // 0123 -> 32x3 vOutLow = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(1, 2, 0, 1)); vOutHigh = _mm_shuffle_ps(vLow, vLow, _MM_SHUFFLE(3, 3, 2, 3)); vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xa0); v2[i] = _mm256_insertf128_ps(v2[i], vOutLow, 0); v2[i] = _mm256_insertf128_ps(v2[i], vOutHigh, 1); } }
void PaLineStrip1Common(PA_STATE &pa, UINT slot, simdvector tri[3]) { simdvector &a = PaGetSimdVector(pa, pa.prev, slot); simdvector &b = PaGetSimdVector(pa, pa.cur, slot); for (int i = 0; i < 4; ++i) { simdscalar a0 = a[i]; simdscalar b0 = b[i]; // index 0 simdvector &v0 = tri[0]; // 45670 -> 45566770 __m128 vPrevHigh = _mm256_extractf128_ps(a0, 1); __m128 vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(2, 1, 1, 0)); __m128 vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 3, 2)); __m128 vCurLow = _mm256_extractf128_ps(b0, 0); float f; _MM_EXTRACT_FLOAT(f, vCurLow, 0); vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xf0); v0[i] = _mm256_insertf128_ps(v0[i], vOutLow, 0); v0[i] = _mm256_insertf128_ps(v0[i], vOutHigh, 1); // index 1 // 45670 -> 45566770 // index 1 same as index 0 simdvector &v1 = tri[1]; v1[i] = v0[i]; // index 2 // 45670 -> 54657607 simdvector &v2 = tri[2]; vOutLow = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(1, 2, 0, 1)); vOutHigh = _mm_shuffle_ps(vPrevHigh, vPrevHigh, _MM_SHUFFLE(3, 3, 2, 3)); vOutHigh = _mm_insert_ps(vOutHigh, _mm_set1_ps(f), 0xa0); v2[i] = _mm256_insertf128_ps(v2[i], vOutLow, 0); v2[i] = _mm256_insertf128_ps(v2[i], vOutHigh, 1); } }
dp::math::Box3f ManagerBitSet::calculateBoundingBox( const GroupSharedPtr& group ) const { #if defined(SSE) if ( useSSE ) { GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group); __m128 minValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() ); __m128 maxValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() ); char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices()); for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index ) { ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index )); dp::math::sse::Mat44f const& modelView = *reinterpret_cast<dp::math::sse::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()); dp::math::Vec4f const& extent = objectImpl->getExtent(); dp::math::sse::Vec4f vectors[8]; vectors[0] = *reinterpret_cast<dp::math::sse::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView; dp::math::sse::Vec4f x( extent[0] * modelView[0] ); dp::math::sse::Vec4f y( extent[1] * modelView[1] ); dp::math::sse::Vec4f z( extent[2] * modelView[2] ); vectors[1] = vectors[0] + x; vectors[2] = vectors[0] + y; vectors[3] = vectors[1] + y; vectors[4] = vectors[0] + z; vectors[5] = vectors[1] + z; vectors[6] = vectors[2] + z; vectors[7] = vectors[3] + z; for ( unsigned int i = 0;i < 8; ++i ) { minValue = _mm_min_ps( minValue, vectors[i].sse() ); maxValue = _mm_max_ps( maxValue, vectors[i].sse() ); } } dp::math::Vec3f minVec, maxVec; _MM_EXTRACT_FLOAT( minVec[0], minValue, 0); _MM_EXTRACT_FLOAT( minVec[1], minValue, 1); _MM_EXTRACT_FLOAT( minVec[2], minValue, 2); _MM_EXTRACT_FLOAT( maxVec[0], maxValue, 0); _MM_EXTRACT_FLOAT( maxVec[1], maxValue, 1); _MM_EXTRACT_FLOAT( maxVec[2], maxValue, 2); return dp::math::Box3f( minVec, maxVec ); } else #elif defined(NEON) if ( useNEON ) { GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group); float32x4_t minValue = vdupq_n_f32( std::numeric_limits<float>::max() ); float32x4_t maxValue = vdupq_n_f32( -std::numeric_limits<float>::max() ); char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices()); for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index ) { const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index )); dp::math::neon::Mat44f const& modelView = *reinterpret_cast<dp::math::neon::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()); dp::math::Vec4f const& extent = objectImpl->getExtent(); dp::math::neon::Vec4f vectors[8]; vectors[0] = *reinterpret_cast<dp::math::neon::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView; dp::math::neon::Vec4f x( extent[0] * modelView[0] ); dp::math::neon::Vec4f y( extent[1] * modelView[1] ); dp::math::neon::Vec4f z( extent[2] * modelView[2] ); vectors[1] = vectors[0] + x; vectors[2] = vectors[0] + y; vectors[3] = vectors[1] + y; vectors[4] = vectors[0] + z; vectors[5] = vectors[1] + z; vectors[6] = vectors[2] + z; vectors[7] = vectors[3] + z; for ( unsigned int i = 0;i < 8; ++i ) { minValue = vminq_f32( minValue, vectors[i].neon() ); maxValue = vmaxq_f32( maxValue, vectors[i].neon() ); } } dp::math::Vec3f minVec, maxVec; vst1q_lane_f32( &minVec[0], minValue, 0); vst1q_lane_f32( &minVec[1], minValue, 1); vst1q_lane_f32( &minVec[2], minValue, 2); vst1q_lane_f32( &maxVec[0], maxValue, 0); vst1q_lane_f32( &maxVec[1], maxValue, 1); vst1q_lane_f32( &maxVec[2], maxValue, 2); return dp::math::Box3f( minVec, maxVec ); } else #endif // CPU fallback { GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group); dp::math::Box4f boundingBox; char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices()); for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index ) { const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index )); dp::math::Mat44f const& modelView = reinterpret_cast<dp::math::Mat44f const&>(*(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride())); dp::math::Vec4f const& extent = objectImpl->getExtent(); dp::math::Vec4f vectors[8]; vectors[0] = (objectImpl->getLowerLeft() * modelView); dp::math::Vec4f x( extent[0] * modelView.getPtr()[0], extent[0] * modelView.getPtr()[1], extent[0] * modelView.getPtr()[2], extent[0] * modelView.getPtr()[3] ); dp::math::Vec4f y( extent[1] * modelView.getPtr()[4], extent[1] * modelView.getPtr()[5], extent[1] * modelView.getPtr()[6], extent[1] * modelView.getPtr()[7] ); dp::math::Vec4f z( extent[2] * modelView.getPtr()[8], extent[2] * modelView.getPtr()[9], extent[2] * modelView.getPtr()[10], extent[2] * modelView.getPtr()[11] ); vectors[1] = vectors[0] + x; vectors[2] = vectors[0] + y; vectors[3] = vectors[1] + y; vectors[4] = vectors[0] + z; vectors[5] = vectors[1] + z; vectors[6] = vectors[2] + z; vectors[7] = vectors[3] + z; for ( unsigned int i = 0;i < 8; ++i ) { boundingBox.update( vectors[i] ); } } dp::math::Vec4f lower = boundingBox.getLower(); dp::math::Vec4f upper = boundingBox.getUpper(); return dp::math::Box3f( dp::math::Vec3f( lower[0], lower[1], lower[2]), dp::math::Vec3f( upper[0], upper[1], upper[2])); } }