/* Copies y into x, clears every YMM register, and then issues three
   consecutive _mm256_zeroupper () calls.  */
void
foo ()
{
  x = y;

  /* Zero the full contents of all YMM registers.  */
  _mm256_zeroall ();

  /* Zero the upper 128 bits of all YMM registers.  The call is issued
     three times in a row, exactly as in the original sequence.  */
  _mm256_zeroupper ();
  _mm256_zeroupper ();
  _mm256_zeroupper ();
}
// Minimal program that exercises a handful of AVX and AVX2 intrinsics.
// The computed vector is discarded; the process always exits with 0.
int main(int, char**)
{
    /* AVX */
    _mm256_zeroall();
    const __m256i zero = _mm256_setzero_si256();

    /* AVX2 */
    const __m256i masked = _mm256_and_si256(zero, zero);
    const __m256i sum    = _mm256_add_epi8(zero, masked);

    // Mark the result as intentionally unused.
    static_cast<void>(sum);

    return 0;
}
void ImageAdd::_RunInt() { const size_t cszIters = static_cast< size_t >( TEST_RUN_COUNT ); printf("\n Running \"_RunInt()\" - Image-Size = %4d x %4d\n", ciImageWidth, ciImageHeight); HandlerList lstHandlers; AppendHandlerList( lstHandlers, ImageAddInt_GetScalarHandlers() ); AppendHandlerList( lstHandlers, ImageAddInt_GetSSEHandlers() ); AppendHandlerList( lstHandlers, ImageAddInt_GetAVXHandlers() ); CREATE_IMAGE( InputImageType1, spInputImage1 ); CREATE_IMAGE( InputImageType2, spInputImage2 ); CREATE_IMAGE( OutputImageType, spOutputImage ); ImageIO::LoadTestImage( *spInputImage1 ); ImageIO::LoadTestImage( *spInputImage2 ); for (auto itHandler : lstHandlers) { printf( "\n Launch handler \"%10s\" -> ", itHandler->GetName().c_str() ); double dTime = omp_get_wtime(); for (size_t szIter = static_cast<size_t>(0); szIter < cszIters; ++szIter) { _mm256_zeroall(); itHandler->Launch( spOutputImage, spInputImage1, spInputImage2 ); } dTime = omp_get_wtime() - dTime; dTime = (dTime * 1000.) / static_cast< double >( cszIters ); printf( "%10.6f ms", dTime ); } #ifdef CONFIG_CHECK_OUTPUT IMAGE_ADD_CHECK_OUTPUT( Int ); #endif printf("\n"); }
void TopologicalErosion::_RunFloat(unsigned int uiKernelSize) { const size_t cszIters = static_cast< size_t >( TEST_RUN_COUNT ); printf("\n Running \"_RunFloat()\" - Image-Size = %4d x %4d\n", ciImageWidth, ciImageHeight); HandlerList lstHandlers; AppendHandlerList( lstHandlers, TopologicalErosionFloat_GetScalarHandlers() ); AppendHandlerList( lstHandlers, TopologicalErosionFloat_GetSSEHandlers() ); AppendHandlerList( lstHandlers, TopologicalErosionFloat_GetAVXHandlers() ); CREATE_IMAGE( InputImageType, spInputImage ); CREATE_IMAGE( OutputImageType, spOutputImage ); ImageIO::LoadTestImage( *spInputImage ); for (auto itHandler : lstHandlers) { printf( "\n Launch handler \"%10s\" -> ", itHandler->GetName().c_str() ); double dTime = omp_get_wtime(); for (size_t szIter = static_cast<size_t>(0); szIter < cszIters; ++szIter) { _mm256_zeroall(); itHandler->Launch( spOutputImage, spInputImage, uiKernelSize ); } dTime = omp_get_wtime() - dTime; dTime = (dTime * 1000.) / static_cast< double >( cszIters ); printf( "%10.6f ms", dTime ); } #ifdef CONFIG_CHECK_OUTPUT TOPOLOGICAL_EROSION_CHECK_OUTPUT( Float ); #endif printf("\n"); }
/// Element-wise addition of two images using 256-bit single-precision AVX
/// loads, adds and stores, processed one row at a time.
///
/// @param aaOutput  Destination image, written with aaInput1 + aaInput2.
/// @param aaInput1  First source image.
/// @param aaInput2  Second source image.
///
/// Every row is bracketed by _mm256_zeroall() before and _mm256_zeroupper()
/// after its vector loop.
///
/// NOTE(review): the column loop always handles VectorWidth pixels per step
/// and has no scalar tail, so it presumably relies on ciWidth being a
/// multiple of VectorWidth - confirm against the definitions of both.
void _Run(OutputPixelType aaOutput[ciHeight][ciWidth], InputPixelType_1 aaInput1[ciHeight][ciWidth], InputPixelType_2 aaInput2[ciHeight][ciWidth])
{
  for (int iRow = 0; iRow < ciHeight; ++iRow)
  {
    _mm256_zeroall();

    // Row base pointers for the current scanline.
    OutputPixelType  *const pDstRow  = aaOutput[iRow];
    InputPixelType_1 *const pSrcRow1 = aaInput1[iRow];
    InputPixelType_2 *const pSrcRow2 = aaInput2[iRow];

    for (int iCol = 0; iCol < ciWidth; iCol += VectorWidth)
    {
      // Unaligned loads: no alignment requirement is placed on the rows.
      const __m256 mmLhs = _mm256_loadu_ps( pSrcRow1 + iCol );
      const __m256 mmRhs = _mm256_loadu_ps( pSrcRow2 + iCol );
      const __m256 mmSum = _mm256_add_ps( mmLhs, mmRhs );

      _mm256_storeu_ps( pDstRow + iCol, mmSum );
    }

    _mm256_zeroupper();
  }
}
/* Copies y into x and then clears every YMM register.  */
void
foo ()
{
  x = y;

  /* Zero the full contents of all YMM registers.  */
  _mm256_zeroall ();
}
// Advances the particle simulation by one step and copies the resulting
// particle positions into the locked vertex buffer.
//
// The attraction point (mx, my) is either the mouse cursor in client
// coordinates of hMainWnd (ManualControl) or whatever UpdatePosition()
// produces. Particles are processed four at a time with 256-bit AVX
// operations inside an OpenMP parallel loop.
//
// NOTE(review): the result of pVertexObject->Lock() is not checked, and
// pVertexBuffer is uninitialized if the lock does not set it.
// NOTE(review): the loop has no tail handling, so it presumably relies on
// partCount being a multiple of 4 - confirm.
void animate() {
    float mx;
    float my;
    if(ManualControl) {
        // Cursor position, converted from screen to client coordinates.
        POINT pos;
        GetCursorPos(&pos);
        RECT rc;
        GetClientRect(hMainWnd, &rc);  // rc is fetched but not used below
        ScreenToClient(hMainWnd, &pos);
        mx = pos.x;
        my = pos.y;
    } else {
        // mx/my are still uninitialized here; presumably UpdatePosition takes
        // them by reference and fills them in - confirm against its signature.
        UpdatePosition(mx, my);
    }

    const auto size = partCount;

    VertexData *pVertexBuffer;
    pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);

    // Clear all YMM registers on the calling thread before the AVX section.
    _mm256_zeroall();

    #pragma omp parallel \
        shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
    {
        // `nowait` drops the barrier at the end of the loop itself.
        #pragma omp for nowait
        for(int i = 0; i < size; i += 4) {
            // Attraction point replicated as four {x, y} pairs.
            float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };

            // NOTE(review): the cast binds before the addition, so these are
            // offsets of i *floats*, while each iteration consumes 8 floats
            // and writes vertices i..i+3. If ParticleCoord/ParticleVel are two
            // floats each, consecutive iterations overlap - confirm that this
            // indexing is intended.
            float *particleCoordsVec = (float*)particlesCoord + i;
            float *velocityVec = (float*)particlesVel + i;

            // Eight floats = four {x, y} positions.
            auto xyCoord = _mm256_loadu_ps(particleCoordsVec);

            // Per-component offset from the attraction point (particle - mouse).
            auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));
            auto squares = _mm256_mul_ps(hwTempData, hwTempData);

            // hadd yields dx*dx + dy*dy per particle; shuffle 0x50 then
            // duplicates each sum so that both the x and the y slot of a
            // particle hold that particle's squared distance.
            auto distSquare = _mm256_hadd_ps(squares, squares);
            distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);

            // Force magnitude G / distance^2, per slot.
            auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);

            // Particles whose squared distance is below 400 (closer than 20
            // units) get no force at all. Element access through m256_f32 is
            // the MSVC layout of __m256.
            if(distSquare.m256_f32[0] < 400) {
                theForce.m256_f32[0] = 0;
                theForce.m256_f32[1] = 0;
            }
            if(distSquare.m256_f32[2] < 400) {
                theForce.m256_f32[2] = 0;
                theForce.m256_f32[3] = 0;
            }
            if(distSquare.m256_f32[4] < 400) {
                theForce.m256_f32[4] = 0;
                theForce.m256_f32[5] = 0;
            }
            if(distSquare.m256_f32[6] < 400) {
                theForce.m256_f32[6] = 0;
                theForce.m256_f32[7] = 0;
            }

            // XOR with -0.f flips the sign bit, i.e. negates the offset
            // (giving mouse - particle) before it is scaled by the force.
            auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);

            // velocity = velocity * Resistance + force; position += velocity.
            auto xyVelocities = _mm256_loadu_ps(velocityVec);
            xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
            xyVelocities = _mm256_add_ps(xyVelocities, xyForces);
            xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

            // Write the updated state back in place.
            _mm256_storeu_ps(velocityVec, xyVelocities);
            _mm256_storeu_ps(particleCoordsVec, xyCoord);

            // Per-particle bounds handling on the values just stored.
            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

            // Publish the four positions to vertices i .. i+3.
            pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x;
            pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y;
            pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
            pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
            pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
            pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
            pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
            pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
        }
    }

    pVertexObject->Unlock();

    // Clear all YMM registers again on the calling thread before returning.
    _mm256_zeroall();
}