Exemple #1
0
/* Copies y into x (both are declared outside this snippet), then issues one
   _mm256_zeroall followed by three back-to-back _mm256_zeroupper calls.
   NOTE(review): the repeated zeroupper calls look deliberate (e.g. a compiler
   test for redundant vzeroupper handling) -- confirm before collapsing them.  */
void
foo ()
{
  x = y;
  /* Zero every YMM register.  */
  _mm256_zeroall ();
  /* Three consecutive requests to zero the upper halves of the YMM registers.  */
  _mm256_zeroupper ();
  _mm256_zeroupper ();
  _mm256_zeroupper ();
}
// Exercises one AVX and two AVX2 intrinsics and always returns 0.
// NOTE(review): presumably a build-time probe for AVX/AVX2 support -- confirm.
int main(int, char**)
{
    // AVX: clear all YMM registers, then create an all-zero 256-bit integer vector.
    _mm256_zeroall();
    const __m256i zeroes = _mm256_setzero_si256();

    // AVX2: bitwise AND of the zero vector with itself, then a byte-wise add.
    const __m256i anded  = _mm256_and_si256(zeroes, zeroes);
    const __m256i summed = _mm256_add_epi8(zeroes, anded);

    // The sum is intentionally discarded; the cast only marks it as used.
    static_cast<void>(summed);
    return 0;
}
// Times every handler returned by the ImageAddInt scalar, SSE and AVX getters.
// Each handler is launched TEST_RUN_COUNT times on the two loaded input images
// and the average wall-clock time per launch is printed in milliseconds.
void ImageAdd::_RunInt()
{
  const size_t cszIters = static_cast<size_t>(TEST_RUN_COUNT);

  printf("\n    Running \"_RunInt()\"  -  Image-Size = %4d x %4d\n", ciImageWidth, ciImageHeight);

  // Handlers are appended in the order scalar, SSE, AVX; they run in that order.
  HandlerList lstHandlers;
  AppendHandlerList(lstHandlers, ImageAddInt_GetScalarHandlers());
  AppendHandlerList(lstHandlers, ImageAddInt_GetSSEHandlers());
  AppendHandlerList(lstHandlers, ImageAddInt_GetAVXHandlers());

  // Two inputs and one output image; both inputs go through LoadTestImage.
  CREATE_IMAGE(InputImageType1, spInputImage1);
  CREATE_IMAGE(InputImageType2, spInputImage2);
  CREATE_IMAGE(OutputImageType, spOutputImage);

  ImageIO::LoadTestImage(*spInputImage1);
  ImageIO::LoadTestImage(*spInputImage2);

  for (auto pHandler : lstHandlers)
  {
    printf( "\n      Launch handler \"%10s\"  ->  ", pHandler->GetName().c_str() );

    const double dStart = omp_get_wtime();

    size_t szRun = 0;
    while (szRun < cszIters)
    {
      // All YMM registers are zeroed before every single launch (part of the timed region).
      _mm256_zeroall();
      pHandler->Launch(spOutputImage, spInputImage1, spInputImage2);
      ++szRun;
    }

    // Convert the elapsed seconds to the mean milliseconds per launch.
    const double dElapsedSec = omp_get_wtime() - dStart;
    const double dMeanMs     = (dElapsedSec * 1000.) / static_cast<double>(cszIters);

    printf( "%10.6f ms", dMeanMs );
  }


#ifdef CONFIG_CHECK_OUTPUT

  IMAGE_ADD_CHECK_OUTPUT( Int );

#endif


  printf("\n");
}
// Times every handler returned by the TopologicalErosionFloat scalar, SSE and
// AVX getters. Each handler is launched TEST_RUN_COUNT times with the given
// kernel size and the average wall-clock time per launch is printed in
// milliseconds.
//
// uiKernelSize is forwarded unchanged to every handler's Launch() call.
void TopologicalErosion::_RunFloat(unsigned int uiKernelSize)
{
  const size_t cszIters = static_cast<size_t>(TEST_RUN_COUNT);

  printf("\n    Running \"_RunFloat()\"  -  Image-Size = %4d x %4d\n", ciImageWidth, ciImageHeight);

  // Handlers are appended in the order scalar, SSE, AVX; they run in that order.
  HandlerList lstHandlers;
  AppendHandlerList(lstHandlers, TopologicalErosionFloat_GetScalarHandlers());
  AppendHandlerList(lstHandlers, TopologicalErosionFloat_GetSSEHandlers());
  AppendHandlerList(lstHandlers, TopologicalErosionFloat_GetAVXHandlers());

  // One input and one output image; only the input goes through LoadTestImage.
  CREATE_IMAGE(InputImageType, spInputImage);
  CREATE_IMAGE(OutputImageType, spOutputImage);

  ImageIO::LoadTestImage(*spInputImage);

  for (auto pHandler : lstHandlers)
  {
    printf( "\n      Launch handler \"%10s\"  ->  ", pHandler->GetName().c_str() );

    const double dStart = omp_get_wtime();

    size_t szRun = 0;
    while (szRun < cszIters)
    {
      // All YMM registers are zeroed before every single launch (part of the timed region).
      _mm256_zeroall();
      pHandler->Launch(spOutputImage, spInputImage, uiKernelSize);
      ++szRun;
    }

    // Convert the elapsed seconds to the mean milliseconds per launch.
    const double dElapsedSec = omp_get_wtime() - dStart;
    const double dMeanMs     = (dElapsedSec * 1000.) / static_cast<double>(cszIters);

    printf( "%10.6f ms", dMeanMs );
  }


#ifdef CONFIG_CHECK_OUTPUT

  TOPOLOGICAL_EROSION_CHECK_OUTPUT( Float );

#endif


  printf("\n");
}
  /// Element-wise image addition: aaOutput[y][x] = aaInput1[y][x] + aaInput2[y][x].
  ///
  /// Every row is processed with unaligned 256-bit float loads/stores. Columns
  /// that do not fill a whole vector at the end of a row are added one by one,
  /// so no load or store ever reaches past the end of the row (the previous
  /// version ran the vector loop while iX < ciWidth and overran the row whenever
  /// ciWidth was not a multiple of the vector width).
  ///
  /// @param aaOutput  destination image, ciHeight rows of ciWidth pixels
  /// @param aaInput1  first summand, same dimensions
  /// @param aaInput2  second summand, same dimensions
  void _Run(OutputPixelType aaOutput[ciHeight][ciWidth], InputPixelType_1 aaInput1[ciHeight][ciWidth], InputPixelType_2 aaInput2[ciHeight][ciWidth])
  {
    for (int iY = 0; iY < ciHeight; ++iY)
    {
      // All YMM registers are zeroed at the start of every row (kept from the original).
      _mm256_zeroall();

      OutputPixelType         *pOutput = aaOutput[iY];
      const InputPixelType_1  *pInput1 = aaInput1[iY];
      const InputPixelType_2  *pInput2 = aaInput2[iY];

      int iX = 0;

      // Vector part: run only while a complete vector still fits into the row.
      // NOTE(review): assumes VectorWidth equals the 8 float lanes of a __m256 -- confirm.
      for (; iX + VectorWidth <= ciWidth; iX += VectorWidth)
      {
        const __m256 mmIn1 = _mm256_loadu_ps( pInput1 + iX );
        const __m256 mmIn2 = _mm256_loadu_ps( pInput2 + iX );

        _mm256_storeu_ps( pOutput + iX, _mm256_add_ps(mmIn1, mmIn2) );
      }

      _mm256_zeroupper();

      // Scalar tail: the remaining (ciWidth % VectorWidth) columns, if any.
      for (; iX < ciWidth; ++iX)
      {
        pOutput[iX] = pInput1[iX] + pInput2[iX];
      }
    }
  }
Exemple #6
0
/* Copies y into x (both are declared outside this snippet) and then zeroes
   every YMM register with a single _mm256_zeroall call.
   NOTE(review): looks like a compiler test input -- confirm before restructuring.  */
void
foo ()
{
  x = y;
  /* Zero every YMM register.  */
  _mm256_zeroall ();
}
void animate()
{
	float mx;
	float my;
	if(ManualControl)
	{
		POINT pos;
		GetCursorPos(&pos);
		RECT rc;
		GetClientRect(hMainWnd, &rc);
		ScreenToClient(hMainWnd, &pos);

		mx = pos.x;
		my = pos.y;
	}
	else
	{
		UpdatePosition(mx, my);
	}


	const auto size = partCount;

	VertexData *pVertexBuffer;
	pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);

	_mm256_zeroall();

#pragma omp parallel \
	shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
	{
#pragma omp for nowait
		for(int i = 0; i < size; i += 4)
		{
			float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };

			float *particleCoordsVec = (float*)particlesCoord + i;
			float *velocityVec = (float*)particlesVel + i;

			auto xyCoord = _mm256_loadu_ps(particleCoordsVec);
			auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));

			auto squares = _mm256_mul_ps(hwTempData, hwTempData);
			auto distSquare = _mm256_hadd_ps(squares, squares);
			distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);

			auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);

			if(distSquare.m256_f32[0] < 400)
			{
				theForce.m256_f32[0] = 0;
				theForce.m256_f32[1] = 0;
			}

			if(distSquare.m256_f32[2] < 400)
			{
				theForce.m256_f32[2] = 0;
				theForce.m256_f32[3] = 0;
			}
			if(distSquare.m256_f32[4] < 400)
			{
				theForce.m256_f32[4] = 0;
				theForce.m256_f32[5] = 0;
			}

			if(distSquare.m256_f32[6] < 400)
			{
				theForce.m256_f32[6] = 0;
				theForce.m256_f32[7] = 0;
			}

			auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);

			auto xyVelocities = _mm256_loadu_ps(velocityVec);
			xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
			xyVelocities = _mm256_add_ps(xyVelocities, xyForces);

			xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

			_mm256_storeu_ps(velocityVec, xyVelocities);
			_mm256_storeu_ps(particleCoordsVec, xyCoord);


			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

			pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x;
			pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y;
			pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
			pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
			pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
			pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
			pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
			pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
		}
	}
	pVertexObject->Unlock();

	_mm256_zeroall();
}