Exemplo n.º 1
0
void RendererGPUBenchmark(FRHICommandListImmediate& RHICmdList, FSynthBenchmarkResults& InOut, const FSceneView& View, float WorkScale, bool bDebugOut)
{
	check(IsInRenderingThread());

	// two RT to ping pong so we force the GPU to flush it's pipeline
	TRefCountPtr<IPooledRenderTarget> RTItems[3];
	{
		FPooledRenderTargetDesc Desc(FPooledRenderTargetDesc::Create2DDesc(FIntPoint(GBenchmarkResolution, GBenchmarkResolution), PF_B8G8R8A8, FClearValueBinding::None, TexCreate_None, TexCreate_RenderTargetable | TexCreate_ShaderResource, false));
		GRenderTargetPool.FindFreeElement(Desc, RTItems[0], TEXT("Benchmark0"));
		GRenderTargetPool.FindFreeElement(Desc, RTItems[1], TEXT("Benchmark1"));

		Desc.Extent = FIntPoint(1, 1);
		Desc.Flags = TexCreate_CPUReadback;	// needs TexCreate_ResolveTargetable?
		Desc.TargetableFlags = TexCreate_None;

		GRenderTargetPool.FindFreeElement(Desc, RTItems[2], TEXT("BenchmarkReadback"));
	}

	// set the state
	RHICmdList.SetBlendState(TStaticBlendState<>::GetRHI());
	RHICmdList.SetRasterizerState(TStaticRasterizerState<>::GetRHI());
	RHICmdList.SetDepthStencilState(TStaticDepthStencilState<false,CF_Always>::GetRHI());

	{
		// larger number means more accuracy but slower, some slower GPUs might timeout with a number to large
		const uint32 IterationCount = 70;
		const uint32 MethodCount = ARRAY_COUNT(InOut.GPUStats);

		// 0 / 1
		uint32 DestRTIndex = 0;

		const uint32 TimerSampleCount = IterationCount * MethodCount + 1;

		static FRenderQueryRHIRef TimerQueries[TimerSampleCount];
		static float LocalWorkScale[IterationCount];

		for(uint32  i = 0; i < TimerSampleCount; ++i)
		{
			TimerQueries[i] = GTimerQueryPool.AllocateQuery();
		}

		const bool bSupportsTimerQueries = (TimerQueries[0] != NULL);
		if(!bSupportsTimerQueries)
		{
			UE_LOG(LogSynthBenchmark, Warning, TEXT("GPU driver does not support timer queries."));

			// Temporary workaround for GL_TIMESTAMP being unavailable and GL_TIME_ELAPSED workaround breaking drivers
#if PLATFORM_MAC
			GLint RendererID = 0;
			float PerfScale = 1.0f;
			[[NSOpenGLContext currentContext] getValues:&RendererID forParameter:NSOpenGLCPCurrentRendererID];
			{
				switch((RendererID & kCGLRendererIDMatchingMask))
				{
					case kCGLRendererATIRadeonX4000ID: // AMD 7xx0 & Dx00 series - should be pretty beefy
						PerfScale = 1.2f;
						break;
					case kCGLRendererATIRadeonX3000ID: // AMD 5xx0, 6xx0 series - mostly OK
					case kCGLRendererGeForceID: // Nvidia 6x0 & 7x0 series - mostly OK
						PerfScale = 2.0f;
						break;
					case kCGLRendererIntelHD5000ID: // Intel HD 5000, Iris, Iris Pro - not dreadful
						PerfScale = 4.2f;
						break;
					case kCGLRendererIntelHD4000ID: // Intel HD 4000 - quite slow
						PerfScale = 7.5f;
						break;
					case kCGLRendererATIRadeonX2000ID: // ATi 4xx0, 3xx0, 2xx0 - almost all very slow and drivers are now very buggy
					case kCGLRendererGeForce8xxxID: // Nvidia 3x0, 2x0, 1x0, 9xx0, 8xx0 - almost all very slow
					case kCGLRendererIntelHDID: // Intel HD 3000 - very, very slow and very buggy driver
					default:
						PerfScale = 10.0f;
						break;
				}
			}
			
			InOut.GPUStats[0] = FSynthBenchmarkStat(TEXT("ALUHeavyNoise"), 1.0f / 4.601f, TEXT("s/GigaPix"));
			InOut.GPUStats[1] = FSynthBenchmarkStat(TEXT("TexHeavy"), 1.0f / 7.447f, TEXT("s/GigaPix"));
			InOut.GPUStats[2] = FSynthBenchmarkStat(TEXT("DepTexHeavy"), 1.0f / 3.847f, TEXT("s/GigaPix"));
			InOut.GPUStats[3] = FSynthBenchmarkStat(TEXT("FillOnly"), 1.0f / 25.463f, TEXT("s/GigaPix"));
			InOut.GPUStats[4] = FSynthBenchmarkStat(TEXT("Bandwidth"), 1.0f / 1.072f, TEXT("s/GigaPix"));
			InOut.GPUStats[0].SetMeasuredTime( FTimeSample(PerfScale, PerfScale * (1.0f / 4.601f)) );
			InOut.GPUStats[1].SetMeasuredTime( FTimeSample(PerfScale, PerfScale * (1.0f / 7.447f)) );
			InOut.GPUStats[2].SetMeasuredTime( FTimeSample(PerfScale, PerfScale * (1.0f / 3.847f)) );
			InOut.GPUStats[3].SetMeasuredTime( FTimeSample(PerfScale, PerfScale * (1.0f / 25.463f)) );
			InOut.GPUStats[4].SetMeasuredTime( FTimeSample(PerfScale, PerfScale * (1.0f / 1.072f)) );
#endif
			return;
		}

		// TimingValues are in Seconds
		FTimingSeries TimingSeries[MethodCount];
		// in 1/1000000 Seconds
		uint64 TotalTimes[MethodCount];
		
		for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator)
		{
			TotalTimes[MethodIterator] = 0;
			TimingSeries[MethodIterator].Init(IterationCount);
		}

		check(MethodCount == 5);
		InOut.GPUStats[0] = FSynthBenchmarkStat(TEXT("ALUHeavyNoise"), 1.0f / 4.601f, TEXT("s/GigaPix"));
		InOut.GPUStats[1] = FSynthBenchmarkStat(TEXT("TexHeavy"), 1.0f / 7.447f, TEXT("s/GigaPix"));
		InOut.GPUStats[2] = FSynthBenchmarkStat(TEXT("DepTexHeavy"), 1.0f / 3.847f, TEXT("s/GigaPix"));
		InOut.GPUStats[3] = FSynthBenchmarkStat(TEXT("FillOnly"), 1.0f / 25.463f, TEXT("s/GigaPix"));
		InOut.GPUStats[4] = FSynthBenchmarkStat(TEXT("Bandwidth"), 1.0f / 1.072f, TEXT("s/GigaPix"));

		// e.g. on NV670: Method3 (mostly fill rate )-> 26GP/s (seems realistic)
		// reference: http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units theoretical: 29.3G/s

		RHICmdList.EndRenderQuery(TimerQueries[0]);

		// multiple iterations to see how trust able the values are
		for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration)
		{
			for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator)
			{
				// alternate between forward and backward (should give the same number)
				//			uint32 MethodId = (Iteration % 2) ? MethodIterator : (MethodCount - 1 - MethodIterator);
				uint32 MethodId = MethodIterator;

				uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId;

				// 0 / 1
				const uint32 SrcRTIndex = 1 - DestRTIndex;

				GRenderTargetPool.VisualizeTexture.SetCheckPoint(RHICmdList, RTItems[DestRTIndex]);

				SetRenderTarget(RHICmdList, RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, FTextureRHIRef());	

				// decide how much work we do in this pass
				LocalWorkScale[Iteration] = (Iteration / 10.f + 1.f) * WorkScale;

				RunBenchmarkShader(RHICmdList, View, MethodId, RTItems[SrcRTIndex], LocalWorkScale[Iteration]);

				RHICmdList.CopyToResolveTarget(RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, RTItems[DestRTIndex]->GetRenderTargetItem().ShaderResourceTexture, false, FResolveParams());

				/*if(bGPUCPUSync)
				{
					// more consistent timing but strangely much faster to the level that is unrealistic

					FResolveParams Param;

					Param.Rect = FResolveRect(0, 0, 1, 1);
					RHICmdList.CopyToResolveTarget(
						RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture,
						RTItems[2]->GetRenderTargetItem().ShaderResourceTexture,
						false,
						Param);

					void* Data = 0;
					int Width = 0;
					int Height = 0;

					RHIMapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture, Data, Width, Height);
					RHIUnmapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture);
				}*/

				RHICmdList.EndRenderQuery(TimerQueries[QueryIndex]);

				// ping pong
				DestRTIndex = 1 - DestRTIndex;
			}
		}

		{
			uint64 OldAbsTime = 0;
			// flushes the RHI thread to make sure all RHICmdList.EndRenderQuery() commands got executed.
			RHICmdList.ImmediateFlush(EImmediateFlushType::FlushRHIThread);
			RHICmdList.GetRenderQueryResult(TimerQueries[0], OldAbsTime, true);
			GTimerQueryPool.ReleaseQuery(RHICmdList, TimerQueries[0]);

			for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration)
			{
				uint32 Results[MethodCount];

				for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
				{
					uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId;

					uint64 AbsTime;
					RHICmdList.GetRenderQueryResult(TimerQueries[QueryIndex], AbsTime, true);
					GTimerQueryPool.ReleaseQuery(RHICmdList, TimerQueries[QueryIndex]);

					uint64 RelTime = AbsTime - OldAbsTime; 

					TotalTimes[MethodId] += RelTime;
					Results[MethodId] = RelTime;

					OldAbsTime = AbsTime;
				}

				for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
				{
					float TimeInSec = Results[MethodId] / 1000000.0f;

					// to normalize from seconds to seconds per GPixel
					float SamplesInGPix = LocalWorkScale[Iteration] * GBenchmarkResolution * GBenchmarkResolution / 1000000000.0f;

					// TimingValue in Seconds per GPixel
					TimingSeries[MethodId].SetEntry(Iteration, TimeInSec / SamplesInGPix);
				}
			}

			if(bSupportsTimerQueries)
			{
				for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
				{
					float Confidence = 0.0f;
					// in seconds per GPixel
					float NormalizedTime = TimingSeries[MethodId].ComputeValue(Confidence);

					if(Confidence > 0)
					{
						FTimeSample TimeSample(TotalTimes[MethodId] / 1000000.0f, NormalizedTime);

						InOut.GPUStats[MethodId].SetMeasuredTime(TimeSample, Confidence);
					}
				}
			}
		}
	}
Exemplo n.º 2
0
void RendererGPUBenchmark(FSynthBenchmarkResults& InOut, const FSceneView& View, uint32 WorkScale, bool bDebugOut)
{
	check(IsInRenderingThread());
	
	// two RT to ping pong so we force the GPU to flush it's pipeline
	TRefCountPtr<IPooledRenderTarget> RTItems[3];
	{
		FPooledRenderTargetDesc Desc(FPooledRenderTargetDesc::Create2DDesc(FIntPoint(GBenchmarkResolution, GBenchmarkResolution), PF_B8G8R8A8, TexCreate_None, TexCreate_RenderTargetable | TexCreate_ShaderResource, false));
		GRenderTargetPool.FindFreeElement(Desc, RTItems[0], TEXT("Benchmark0"));
		GRenderTargetPool.FindFreeElement(Desc, RTItems[1], TEXT("Benchmark1"));

		Desc.Extent = FIntPoint(1, 1);
		Desc.Flags = TexCreate_CPUReadback;	// needs TexCreate_ResolveTargetable?
		Desc.TargetableFlags = TexCreate_None;

		GRenderTargetPool.FindFreeElement(Desc, RTItems[2], TEXT("BenchmarkReadback"));
	}

	// set the state
	RHISetBlendState(TStaticBlendState<>::GetRHI());
	RHISetRasterizerState(TStaticRasterizerState<>::GetRHI());
	RHISetDepthStencilState(TStaticDepthStencilState<false,CF_Always>::GetRHI());

	{
		// larger number means more accuracy but slower, some slower GPUs might timeout with a number to large
		const uint32 IterationCount = 70;
		const uint32 MethodCount = ARRAY_COUNT(InOut.GPUStats);

		// 0 / 1
		uint32 DestRTIndex = 0;

		const uint32 TimerSampleCount = IterationCount * MethodCount + 1;

		static FRenderQueryRHIRef TimerQueries[TimerSampleCount];
		static uint32 PassCount[IterationCount];

		for(uint32  i = 0; i < TimerSampleCount; ++i)
		{
			TimerQueries[i] = GTimerQueryPool.AllocateQuery();
		}

		if(!TimerQueries[0])
		{
			UE_LOG(LogSynthBenchmark, Warning, TEXT("GPU driver does not support timer queries."));
		}

		// TimingValues are in Seconds per GPixel
		FTimingSeries TimingSeries[MethodCount];
		
		for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator)
		{
			TimingSeries[MethodIterator].Init(IterationCount);
		}

		check(MethodCount == 5);
		InOut.GPUStats[0] = FSynthBenchmarkStat(TEXT("ALUHeavyNoise"), 1.0f / 4.601f, TEXT("s/GigaPix"));
		InOut.GPUStats[1] = FSynthBenchmarkStat(TEXT("TexHeavy"), 1.0f / 7.447f, TEXT("s/GigaPix"));
		InOut.GPUStats[2] = FSynthBenchmarkStat(TEXT("DepTexHeavy"), 1.0f / 3.847f, TEXT("s/GigaPix"));
		InOut.GPUStats[3] = FSynthBenchmarkStat(TEXT("FillOnly"), 1.0f / 25.463f, TEXT("s/GigaPix"));
		InOut.GPUStats[4] = FSynthBenchmarkStat(TEXT("Bandwidth"), 1.0f / 1.072f, TEXT("s/GigaPix"));

		// e.g. on NV670: Method3 (mostly fill rate )-> 26GP/s (seems realistic)
		// reference: http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units theoretical: 29.3G/s

		RHIEndRenderQuery(TimerQueries[0]);

		// multiple iterations to see how trust able the values are
		for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration)
		{
			for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator)
			{
				// alternate between forward and backward (should give the same number)
				//			uint32 MethodId = (Iteration % 2) ? MethodIterator : (MethodCount - 1 - MethodIterator);
				uint32 MethodId = MethodIterator;

				uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId;

				// 0 / 1
				const uint32 SrcRTIndex = 1 - DestRTIndex;

				GRenderTargetPool.VisualizeTexture.SetCheckPoint(RTItems[DestRTIndex]);

				RHISetRenderTarget(RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, FTextureRHIRef());	

				// decide how much work we do in this pass
				PassCount[Iteration] = (Iteration / 10 + 1) * WorkScale;

				RunBenchmarkShader(View, MethodId, RTItems[SrcRTIndex], PassCount[Iteration]);

				RHICopyToResolveTarget(RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, RTItems[DestRTIndex]->GetRenderTargetItem().ShaderResourceTexture, false, FResolveParams());

				/*if(bGPUCPUSync)
				{
					// more consistent timing but strangely much faster to the level that is unrealistic

					FResolveParams Param;

					Param.Rect = FResolveRect(0, 0, 1, 1);
					RHICopyToResolveTarget(
						RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture,
						RTItems[2]->GetRenderTargetItem().ShaderResourceTexture,
						false,
						Param);

					void* Data = 0;
					int Width = 0;
					int Height = 0;

					RHIMapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture, Data, Width, Height);
					RHIUnmapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture);
				}*/

				RHIEndRenderQuery(TimerQueries[QueryIndex]);

				// ping pong
				DestRTIndex = 1 - DestRTIndex;
			}
		}

		{
			uint64 OldAbsTime = 0;
			RHIGetRenderQueryResult(TimerQueries[0], OldAbsTime, true);
			GTimerQueryPool.ReleaseQuery(TimerQueries[0]);

#if !UE_BUILD_SHIPPING
			FBenchmarkGraph BenchmarkGraph(IterationCount, IterationCount, *(FPaths::ScreenShotDir() + TEXT("GPUSynthBenchmarkGraph.bmp")));
#endif

			for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration)
			{
				uint32 Results[MethodCount];

				for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
				{
					uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId;

					uint64 AbsTime;
					RHIGetRenderQueryResult(TimerQueries[QueryIndex], AbsTime, true);
					GTimerQueryPool.ReleaseQuery(TimerQueries[QueryIndex]);

					Results[MethodId] = AbsTime - OldAbsTime;
					OldAbsTime = AbsTime;
				}

				double SamplesInGPix = PassCount[Iteration] * GBenchmarkResolution * GBenchmarkResolution / 1000000000.0;

				for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
				{
					double TimeInSec = Results[MethodId] / 1000000.0;
					double TimingValue = TimeInSec / SamplesInGPix;

					// TimingValue in Seconds per GPixel
					TimingSeries[MethodId].SetEntry(Iteration, (float)TimingValue);
				}

#if !UE_BUILD_SHIPPING
				{
					// This is for debugging and we don't want to change the output but we still use "InOut".
					// That shouldn't hurt, as we override the values after that anyway.

					for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
					{
						InOut.GPUStats[MethodId].SetMeasuredTime(TimingSeries[MethodId].GetEntry(Iteration));
					}

					float LocalGPUIndex = InOut.ComputeGPUPerfIndex();

					// * 0.01 to get it in 0..1 range
					// * 0.5f to have 100 is the middle
					BenchmarkGraph.DrawBar(Iteration, LocalGPUIndex * 0.01f * 0.5f);
				}
#endif
			}

			for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
			{
				float Confidence = 0.0f;
				
				float TimingValue = TimingSeries[MethodId].ComputeValue(Confidence);

				if(Confidence > 0)
				{
					InOut.GPUStats[MethodId].SetMeasuredTime(TimingValue, Confidence);
				}

				UE_LOG(LogSynthBenchmark, Display, TEXT("         ... %.3f GigaPix/s, Confidence=%.0f%% '%s'"),
					1.0f / InOut.GPUStats[MethodId].GetMeasuredTime(), Confidence, InOut.GPUStats[MethodId].GetDesc());
			}

			UE_LOG(LogSynthBenchmark, Display, TEXT(""));
			
#if !UE_BUILD_SHIPPING
			if(bDebugOut)
			{
				BenchmarkGraph.Save();
			}
#endif
		}
	}
}
void FSynthBenchmark::Run(FSynthBenchmarkResults& InOut, bool bGPUBenchmark, float WorkScale) const
{
	check(WorkScale > 0);

	if(!bGPUBenchmark)
	{
		// run a very quick GPU benchmark (less confidence but at least we get some numbers)
		// it costs little time and we get some stats
		WorkScale = 1.0f;
	}

	const double StartTime = FPlatformTime::Seconds();

	UE_LOG(LogSynthBenchmark, Display, TEXT("FSynthBenchmark (V0.95):  requested WorkScale=%.2f"), WorkScale);
	UE_LOG(LogSynthBenchmark, Display, TEXT("==============="));
	
#if UE_BUILD_DEBUG
	UE_LOG(LogSynthBenchmark, Display, TEXT("         Note: Values are not trustable because this is a DEBUG build!"));
#endif
	
	UE_LOG(LogSynthBenchmark, Display, TEXT("Main Processor:"));

	// developer machine: Intel Xeon E5-2660 2.2GHz
	// divided by the actual value on a developer machine to normalize the results
	// Index should be around 100 +-4 on developer machine in a development build (should be the same in shipping)

	InOut.CPUStats[0] = FSynthBenchmarkStat(TEXT("RayIntersect"), 0.02561f, TEXT("s/Run"), 1.f);
	InOut.CPUStats[0].SetMeasuredTime(RunBenchmark(WorkScale, RayIntersectBenchmark));

	InOut.CPUStats[1] = FSynthBenchmarkStat(TEXT("Fractal"), 0.0286f, TEXT("s/Run"), 1.5f);
	InOut.CPUStats[1].SetMeasuredTime(RunBenchmark(WorkScale, FractalBenchmark));

	for(uint32 i = 0; i < ARRAY_COUNT(InOut.CPUStats); ++i)
	{
		UE_LOG(LogSynthBenchmark, Display, TEXT("         ... %f %s '%s'"), InOut.CPUStats[i].GetNormalizedTime(), InOut.CPUStats[i].GetValueType(), InOut.CPUStats[i].GetDesc());
	}

	UE_LOG(LogSynthBenchmark, Display, TEXT(""));

	bool bAppIs64Bit = (sizeof(void*) == 8);

	UE_LOG(LogSynthBenchmark, Display, TEXT("  CompiledTarget_x_Bits: %s"), bAppIs64Bit ? TEXT("64") : TEXT("32"));
	UE_LOG(LogSynthBenchmark, Display, TEXT("  UE_BUILD_SHIPPING: %d"), UE_BUILD_SHIPPING);
	UE_LOG(LogSynthBenchmark, Display, TEXT("  UE_BUILD_TEST: %d"), UE_BUILD_TEST);
	UE_LOG(LogSynthBenchmark, Display, TEXT("  UE_BUILD_DEBUG: %d"), UE_BUILD_DEBUG);

	UE_LOG(LogSynthBenchmark, Display, TEXT("  TotalPhysicalGBRam: %d"), FPlatformMemory::GetPhysicalGBRam());
	UE_LOG(LogSynthBenchmark, Display, TEXT("  NumberOfCores (physical): %d"), FPlatformMisc::NumberOfCores());
	UE_LOG(LogSynthBenchmark, Display, TEXT("  NumberOfCores (logical): %d"), FPlatformMisc::NumberOfCoresIncludingHyperthreads());

	UE_LOG(LogSynthBenchmark, Display, TEXT("  CPU Perf Index 0: %.1f (weight %.2f)"), InOut.CPUStats[0].ComputePerfIndex(), InOut.CPUStats[0].GetWeight());
	UE_LOG(LogSynthBenchmark, Display, TEXT("  CPU Perf Index 1: %.1f (weight %.2f)"), InOut.CPUStats[1].ComputePerfIndex(), InOut.CPUStats[1].GetWeight());
	
	// separator line
	UE_LOG(LogSynthBenchmark, Display, TEXT(" "));

	UE_LOG(LogSynthBenchmark, Display, TEXT("Graphics:"));

	UE_LOG(LogSynthBenchmark, Display, TEXT("  Adapter Name: '%s'"), *GRHIAdapterName);
	UE_LOG(LogSynthBenchmark, Display, TEXT("  (On Optimus the name might be wrong, memory should be ok)"));
	UE_LOG(LogSynthBenchmark, Display, TEXT("  Vendor Id: 0x%x"), GRHIVendorId);

	{
		FTextureMemoryStats Stats;

		RHIGetTextureMemoryStats(Stats);

		if(Stats.AreHardwareStatsValid())
		{
			UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU Memory: %d/%d/%d MB"), 
				FMath::DivideAndRoundUp(Stats.DedicatedVideoMemory, (int64)(1024 * 1024) ),
				FMath::DivideAndRoundUp(Stats.DedicatedSystemMemory, (int64)(1024 * 1024) ),
				FMath::DivideAndRoundUp(Stats.SharedSystemMemory, (int64)(1024 * 1024) ));
		}
	}

	// not always done - cost some time.
	if(bGPUBenchmark)
	{
		IRendererModule& RendererModule = FModuleManager::LoadModuleChecked<IRendererModule>(TEXT("Renderer"));

		// First we run a quick test. If that shows very bad performance we don't need another test
		// The hardware is slow, we don't need a long test and risk driver TDR (driver recovery).
		// We have seen this problem on very low end GPUs.
		{
			const float fFirstWorkScale = 0.01f;
			const float fSecondWorkScale = 0.1f;

			float GPUTime = 0.0f;

			RendererModule.GPUBenchmark(InOut, fFirstWorkScale);
			GPUTime = InOut.ComputeTotalGPUTime();
			UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU first test: %.2fs"), GPUTime);

			for(uint32 MethodId = 0; MethodId < sizeof(InOut.GPUStats) / sizeof(InOut.GPUStats[0]); ++MethodId)
			{
				UE_LOG(LogSynthBenchmark, Display, TEXT("         ... %.3f GigaPix/s, Confidence=%.0f%% '%s' (likely to be very inaccurate)"),
					1.0f / InOut.GPUStats[MethodId].GetNormalizedTime(), InOut.GPUStats[MethodId].GetConfidence(), InOut.GPUStats[MethodId].GetDesc());
			}

			if(GPUTime < 0.1f)
			{
				RendererModule.GPUBenchmark(InOut, fSecondWorkScale);
				GPUTime = InOut.ComputeTotalGPUTime();
				UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU second test: %.2fs"), GPUTime);

				// for testing
				for(uint32 MethodId = 0; MethodId < sizeof(InOut.GPUStats) / sizeof(InOut.GPUStats[0]); ++MethodId)
				{
					UE_LOG(LogSynthBenchmark, Display, TEXT("         ... %.3f GigaPix/s, Confidence=%.0f%% '%s' (likely to be inaccurate)"),
						1.0f / InOut.GPUStats[MethodId].GetNormalizedTime(), InOut.GPUStats[MethodId].GetConfidence(), InOut.GPUStats[MethodId].GetDesc());
				}

				if(GPUTime < 0.1f)
				{
					RendererModule.GPUBenchmark(InOut, WorkScale);
					GPUTime = InOut.ComputeTotalGPUTime();
					UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU third test: %.2fs"), GPUTime);
				}
			}
		}

		for(uint32 MethodId = 0; MethodId < sizeof(InOut.GPUStats) / sizeof(InOut.GPUStats[0]); ++MethodId)
		{
			UE_LOG(LogSynthBenchmark, Display, TEXT("         ... %.3f GigaPix/s, Confidence=%.0f%% '%s'"),
				1.0f / InOut.GPUStats[MethodId].GetNormalizedTime(), InOut.GPUStats[MethodId].GetConfidence(), InOut.GPUStats[MethodId].GetDesc());
		}
		UE_LOG(LogSynthBenchmark, Display, TEXT(""));

		UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU Perf Index 0: %.1f (weight %.2f)"), InOut.GPUStats[0].ComputePerfIndex(), InOut.GPUStats[0].GetWeight());
		UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU Perf Index 1: %.1f (weight %.2f)"), InOut.GPUStats[1].ComputePerfIndex(), InOut.GPUStats[1].GetWeight());
		UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU Perf Index 2: %.1f (weight %.2f)"), InOut.GPUStats[2].ComputePerfIndex(), InOut.GPUStats[2].GetWeight());
		UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU Perf Index 3: %.1f (weight %.2f)"), InOut.GPUStats[3].ComputePerfIndex(), InOut.GPUStats[3].GetWeight());
		UE_LOG(LogSynthBenchmark, Display, TEXT("  GPU Perf Index 4: %.1f (weight %.2f)"), InOut.GPUStats[4].ComputePerfIndex(), InOut.GPUStats[4].GetWeight());
	}
	
	UE_LOG(LogSynthBenchmark, Display, TEXT("  CPUIndex: %.1f"), InOut.ComputeCPUPerfIndex());

	if(bGPUBenchmark)
	{
		UE_LOG(LogSynthBenchmark, Display, TEXT("  GPUIndex: %.1f"), InOut.ComputeGPUPerfIndex());
	}

	UE_LOG(LogSynthBenchmark, Display, TEXT(""));
	UE_LOG(LogSynthBenchmark, Display, TEXT("         ... Total Time: %f sec"),  (float)(FPlatformTime::Seconds() - StartTime));
}