/**
 * Runs the synthetic benchmark and converts the measured performance indices
 * into one quality option per scalability group.
 *
 * GPU-only groups (resolution, anti-aliasing, post processing, textures) are
 * driven by the GPU index; the remaining groups use the smaller of the CPU and
 * GPU index.
 *
 * @return quality levels derived from the benchmark results
 */
FQualityLevels BenchmarkQualityLevels()
{
	FQualityLevels Levels;

	// measure the machine (second argument requests the GPU part as well)
	FSynthBenchmarkResults Measured;
	ISynthBenchmark::Get().Run(Measured, true);

	const float CPUIndex = Measured.ComputeCPUPerfIndex();
	const float GPUIndex = Measured.ComputeGPUPerfIndex();
	// the lower of the two indices decides for groups that depend on both
	const float LowestIndex = FMath::Min(CPUIndex, GPUIndex);

	// map each index onto an option using per-group thresholds
	Levels.ResolutionQuality = GetRenderScaleLevelFromQualityLevel(
		ComputeOptionFromPerfIndex(GPUIndex, 15, 45, 70));
	Levels.ViewDistanceQuality = ComputeOptionFromPerfIndex(LowestIndex, 20, 50, 70);
	Levels.AntiAliasingQuality = ComputeOptionFromPerfIndex(GPUIndex, 15, 50, 70);
	Levels.ShadowQuality = ComputeOptionFromPerfIndex(LowestIndex, 15, 50, 70);
	Levels.PostProcessQuality = ComputeOptionFromPerfIndex(GPUIndex, 20, 50, 70);
	Levels.TextureQuality = ComputeOptionFromPerfIndex(GPUIndex, 10, 40, 70);
	Levels.EffectsQuality = ComputeOptionFromPerfIndex(LowestIndex, 25, 55, 70);

	return Levels;
}
void RendererGPUBenchmark(FSynthBenchmarkResults& InOut, const FSceneView& View, uint32 WorkScale, bool bDebugOut) { check(IsInRenderingThread()); // two RT to ping pong so we force the GPU to flush it's pipeline TRefCountPtr<IPooledRenderTarget> RTItems[3]; { FPooledRenderTargetDesc Desc(FPooledRenderTargetDesc::Create2DDesc(FIntPoint(GBenchmarkResolution, GBenchmarkResolution), PF_B8G8R8A8, TexCreate_None, TexCreate_RenderTargetable | TexCreate_ShaderResource, false)); GRenderTargetPool.FindFreeElement(Desc, RTItems[0], TEXT("Benchmark0")); GRenderTargetPool.FindFreeElement(Desc, RTItems[1], TEXT("Benchmark1")); Desc.Extent = FIntPoint(1, 1); Desc.Flags = TexCreate_CPUReadback; // needs TexCreate_ResolveTargetable? Desc.TargetableFlags = TexCreate_None; GRenderTargetPool.FindFreeElement(Desc, RTItems[2], TEXT("BenchmarkReadback")); } // set the state RHISetBlendState(TStaticBlendState<>::GetRHI()); RHISetRasterizerState(TStaticRasterizerState<>::GetRHI()); RHISetDepthStencilState(TStaticDepthStencilState<false,CF_Always>::GetRHI()); { // larger number means more accuracy but slower, some slower GPUs might timeout with a number to large const uint32 IterationCount = 70; const uint32 MethodCount = ARRAY_COUNT(InOut.GPUStats); // 0 / 1 uint32 DestRTIndex = 0; const uint32 TimerSampleCount = IterationCount * MethodCount + 1; static FRenderQueryRHIRef TimerQueries[TimerSampleCount]; static uint32 PassCount[IterationCount]; for(uint32 i = 0; i < TimerSampleCount; ++i) { TimerQueries[i] = GTimerQueryPool.AllocateQuery(); } if(!TimerQueries[0]) { UE_LOG(LogSynthBenchmark, Warning, TEXT("GPU driver does not support timer queries.")); } // TimingValues are in Seconds per GPixel FTimingSeries TimingSeries[MethodCount]; for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator) { TimingSeries[MethodIterator].Init(IterationCount); } check(MethodCount == 5); InOut.GPUStats[0] = FSynthBenchmarkStat(TEXT("ALUHeavyNoise"), 1.0f / 4.601f, TEXT("s/GigaPix")); 
InOut.GPUStats[1] = FSynthBenchmarkStat(TEXT("TexHeavy"), 1.0f / 7.447f, TEXT("s/GigaPix")); InOut.GPUStats[2] = FSynthBenchmarkStat(TEXT("DepTexHeavy"), 1.0f / 3.847f, TEXT("s/GigaPix")); InOut.GPUStats[3] = FSynthBenchmarkStat(TEXT("FillOnly"), 1.0f / 25.463f, TEXT("s/GigaPix")); InOut.GPUStats[4] = FSynthBenchmarkStat(TEXT("Bandwidth"), 1.0f / 1.072f, TEXT("s/GigaPix")); // e.g. on NV670: Method3 (mostly fill rate )-> 26GP/s (seems realistic) // reference: http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units theoretical: 29.3G/s RHIEndRenderQuery(TimerQueries[0]); // multiple iterations to see how trust able the values are for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration) { for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator) { // alternate between forward and backward (should give the same number) // uint32 MethodId = (Iteration % 2) ? MethodIterator : (MethodCount - 1 - MethodIterator); uint32 MethodId = MethodIterator; uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId; // 0 / 1 const uint32 SrcRTIndex = 1 - DestRTIndex; GRenderTargetPool.VisualizeTexture.SetCheckPoint(RTItems[DestRTIndex]); RHISetRenderTarget(RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, FTextureRHIRef()); // decide how much work we do in this pass PassCount[Iteration] = (Iteration / 10 + 1) * WorkScale; RunBenchmarkShader(View, MethodId, RTItems[SrcRTIndex], PassCount[Iteration]); RHICopyToResolveTarget(RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, RTItems[DestRTIndex]->GetRenderTargetItem().ShaderResourceTexture, false, FResolveParams()); /*if(bGPUCPUSync) { // more consistent timing but strangely much faster to the level that is unrealistic FResolveParams Param; Param.Rect = FResolveRect(0, 0, 1, 1); RHICopyToResolveTarget( RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, RTItems[2]->GetRenderTargetItem().ShaderResourceTexture, false, Param); void* Data = 0; int Width 
= 0; int Height = 0; RHIMapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture, Data, Width, Height); RHIUnmapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture); }*/ RHIEndRenderQuery(TimerQueries[QueryIndex]); // ping pong DestRTIndex = 1 - DestRTIndex; } } { uint64 OldAbsTime = 0; RHIGetRenderQueryResult(TimerQueries[0], OldAbsTime, true); GTimerQueryPool.ReleaseQuery(TimerQueries[0]); #if !UE_BUILD_SHIPPING FBenchmarkGraph BenchmarkGraph(IterationCount, IterationCount, *(FPaths::ScreenShotDir() + TEXT("GPUSynthBenchmarkGraph.bmp"))); #endif for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration) { uint32 Results[MethodCount]; for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId) { uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId; uint64 AbsTime; RHIGetRenderQueryResult(TimerQueries[QueryIndex], AbsTime, true); GTimerQueryPool.ReleaseQuery(TimerQueries[QueryIndex]); Results[MethodId] = AbsTime - OldAbsTime; OldAbsTime = AbsTime; } double SamplesInGPix = PassCount[Iteration] * GBenchmarkResolution * GBenchmarkResolution / 1000000000.0; for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId) { double TimeInSec = Results[MethodId] / 1000000.0; double TimingValue = TimeInSec / SamplesInGPix; // TimingValue in Seconds per GPixel TimingSeries[MethodId].SetEntry(Iteration, (float)TimingValue); } #if !UE_BUILD_SHIPPING { // This is for debugging and we don't want to change the output but we still use "InOut". // That shouldn't hurt, as we override the values after that anyway. 
for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId) { InOut.GPUStats[MethodId].SetMeasuredTime(TimingSeries[MethodId].GetEntry(Iteration)); } float LocalGPUIndex = InOut.ComputeGPUPerfIndex(); // * 0.01 to get it in 0..1 range // * 0.5f to have 100 is the middle BenchmarkGraph.DrawBar(Iteration, LocalGPUIndex * 0.01f * 0.5f); } #endif } for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId) { float Confidence = 0.0f; float TimingValue = TimingSeries[MethodId].ComputeValue(Confidence); if(Confidence > 0) { InOut.GPUStats[MethodId].SetMeasuredTime(TimingValue, Confidence); } UE_LOG(LogSynthBenchmark, Display, TEXT(" ... %.3f GigaPix/s, Confidence=%.0f%% '%s'"), 1.0f / InOut.GPUStats[MethodId].GetMeasuredTime(), Confidence, InOut.GPUStats[MethodId].GetDesc()); } UE_LOG(LogSynthBenchmark, Display, TEXT("")); #if !UE_BUILD_SHIPPING if(bDebugOut) { BenchmarkGraph.Save(); } #endif } } }
void FSynthBenchmark::Run(FSynthBenchmarkResults& InOut, bool bGPUBenchmark, float WorkScale) const { check(WorkScale > 0); if(!bGPUBenchmark) { // run a very quick GPU benchmark (less confidence but at least we get some numbers) // it costs little time and we get some stats WorkScale = 1.0f; } const double StartTime = FPlatformTime::Seconds(); UE_LOG(LogSynthBenchmark, Display, TEXT("FSynthBenchmark (V0.95): requested WorkScale=%.2f"), WorkScale); UE_LOG(LogSynthBenchmark, Display, TEXT("===============")); #if UE_BUILD_DEBUG UE_LOG(LogSynthBenchmark, Display, TEXT(" Note: Values are not trustable because this is a DEBUG build!")); #endif UE_LOG(LogSynthBenchmark, Display, TEXT("Main Processor:")); // developer machine: Intel Xeon E5-2660 2.2GHz // divided by the actual value on a developer machine to normalize the results // Index should be around 100 +-4 on developer machine in a development build (should be the same in shipping) InOut.CPUStats[0] = FSynthBenchmarkStat(TEXT("RayIntersect"), 0.02561f, TEXT("s/Run"), 1.f); InOut.CPUStats[0].SetMeasuredTime(RunBenchmark(WorkScale, RayIntersectBenchmark)); InOut.CPUStats[1] = FSynthBenchmarkStat(TEXT("Fractal"), 0.0286f, TEXT("s/Run"), 1.5f); InOut.CPUStats[1].SetMeasuredTime(RunBenchmark(WorkScale, FractalBenchmark)); for(uint32 i = 0; i < ARRAY_COUNT(InOut.CPUStats); ++i) { UE_LOG(LogSynthBenchmark, Display, TEXT(" ... %f %s '%s'"), InOut.CPUStats[i].GetNormalizedTime(), InOut.CPUStats[i].GetValueType(), InOut.CPUStats[i].GetDesc()); } UE_LOG(LogSynthBenchmark, Display, TEXT("")); bool bAppIs64Bit = (sizeof(void*) == 8); UE_LOG(LogSynthBenchmark, Display, TEXT(" CompiledTarget_x_Bits: %s"), bAppIs64Bit ? 
TEXT("64") : TEXT("32")); UE_LOG(LogSynthBenchmark, Display, TEXT(" UE_BUILD_SHIPPING: %d"), UE_BUILD_SHIPPING); UE_LOG(LogSynthBenchmark, Display, TEXT(" UE_BUILD_TEST: %d"), UE_BUILD_TEST); UE_LOG(LogSynthBenchmark, Display, TEXT(" UE_BUILD_DEBUG: %d"), UE_BUILD_DEBUG); UE_LOG(LogSynthBenchmark, Display, TEXT(" TotalPhysicalGBRam: %d"), FPlatformMemory::GetPhysicalGBRam()); UE_LOG(LogSynthBenchmark, Display, TEXT(" NumberOfCores (physical): %d"), FPlatformMisc::NumberOfCores()); UE_LOG(LogSynthBenchmark, Display, TEXT(" NumberOfCores (logical): %d"), FPlatformMisc::NumberOfCoresIncludingHyperthreads()); UE_LOG(LogSynthBenchmark, Display, TEXT(" CPU Perf Index 0: %.1f (weight %.2f)"), InOut.CPUStats[0].ComputePerfIndex(), InOut.CPUStats[0].GetWeight()); UE_LOG(LogSynthBenchmark, Display, TEXT(" CPU Perf Index 1: %.1f (weight %.2f)"), InOut.CPUStats[1].ComputePerfIndex(), InOut.CPUStats[1].GetWeight()); // separator line UE_LOG(LogSynthBenchmark, Display, TEXT(" ")); UE_LOG(LogSynthBenchmark, Display, TEXT("Graphics:")); UE_LOG(LogSynthBenchmark, Display, TEXT(" Adapter Name: '%s'"), *GRHIAdapterName); UE_LOG(LogSynthBenchmark, Display, TEXT(" (On Optimus the name might be wrong, memory should be ok)")); UE_LOG(LogSynthBenchmark, Display, TEXT(" Vendor Id: 0x%x"), GRHIVendorId); { FTextureMemoryStats Stats; RHIGetTextureMemoryStats(Stats); if(Stats.AreHardwareStatsValid()) { UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU Memory: %d/%d/%d MB"), FMath::DivideAndRoundUp(Stats.DedicatedVideoMemory, (int64)(1024 * 1024) ), FMath::DivideAndRoundUp(Stats.DedicatedSystemMemory, (int64)(1024 * 1024) ), FMath::DivideAndRoundUp(Stats.SharedSystemMemory, (int64)(1024 * 1024) )); } } // not always done - cost some time. if(bGPUBenchmark) { IRendererModule& RendererModule = FModuleManager::LoadModuleChecked<IRendererModule>(TEXT("Renderer")); // First we run a quick test. 
If that shows very bad performance we don't need another test // The hardware is slow, we don't need a long test and risk driver TDR (driver recovery). // We have seen this problem on very low end GPUs. { const float fFirstWorkScale = 0.01f; const float fSecondWorkScale = 0.1f; float GPUTime = 0.0f; RendererModule.GPUBenchmark(InOut, fFirstWorkScale); GPUTime = InOut.ComputeTotalGPUTime(); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU first test: %.2fs"), GPUTime); for(uint32 MethodId = 0; MethodId < sizeof(InOut.GPUStats) / sizeof(InOut.GPUStats[0]); ++MethodId) { UE_LOG(LogSynthBenchmark, Display, TEXT(" ... %.3f GigaPix/s, Confidence=%.0f%% '%s' (likely to be very inaccurate)"), 1.0f / InOut.GPUStats[MethodId].GetNormalizedTime(), InOut.GPUStats[MethodId].GetConfidence(), InOut.GPUStats[MethodId].GetDesc()); } if(GPUTime < 0.1f) { RendererModule.GPUBenchmark(InOut, fSecondWorkScale); GPUTime = InOut.ComputeTotalGPUTime(); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU second test: %.2fs"), GPUTime); // for testing for(uint32 MethodId = 0; MethodId < sizeof(InOut.GPUStats) / sizeof(InOut.GPUStats[0]); ++MethodId) { UE_LOG(LogSynthBenchmark, Display, TEXT(" ... %.3f GigaPix/s, Confidence=%.0f%% '%s' (likely to be inaccurate)"), 1.0f / InOut.GPUStats[MethodId].GetNormalizedTime(), InOut.GPUStats[MethodId].GetConfidence(), InOut.GPUStats[MethodId].GetDesc()); } if(GPUTime < 0.1f) { RendererModule.GPUBenchmark(InOut, WorkScale); GPUTime = InOut.ComputeTotalGPUTime(); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU third test: %.2fs"), GPUTime); } } } for(uint32 MethodId = 0; MethodId < sizeof(InOut.GPUStats) / sizeof(InOut.GPUStats[0]); ++MethodId) { UE_LOG(LogSynthBenchmark, Display, TEXT(" ... 
%.3f GigaPix/s, Confidence=%.0f%% '%s'"), 1.0f / InOut.GPUStats[MethodId].GetNormalizedTime(), InOut.GPUStats[MethodId].GetConfidence(), InOut.GPUStats[MethodId].GetDesc()); } UE_LOG(LogSynthBenchmark, Display, TEXT("")); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU Perf Index 0: %.1f (weight %.2f)"), InOut.GPUStats[0].ComputePerfIndex(), InOut.GPUStats[0].GetWeight()); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU Perf Index 1: %.1f (weight %.2f)"), InOut.GPUStats[1].ComputePerfIndex(), InOut.GPUStats[1].GetWeight()); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU Perf Index 2: %.1f (weight %.2f)"), InOut.GPUStats[2].ComputePerfIndex(), InOut.GPUStats[2].GetWeight()); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU Perf Index 3: %.1f (weight %.2f)"), InOut.GPUStats[3].ComputePerfIndex(), InOut.GPUStats[3].GetWeight()); UE_LOG(LogSynthBenchmark, Display, TEXT(" GPU Perf Index 4: %.1f (weight %.2f)"), InOut.GPUStats[4].ComputePerfIndex(), InOut.GPUStats[4].GetWeight()); } UE_LOG(LogSynthBenchmark, Display, TEXT(" CPUIndex: %.1f"), InOut.ComputeCPUPerfIndex()); if(bGPUBenchmark) { UE_LOG(LogSynthBenchmark, Display, TEXT(" GPUIndex: %.1f"), InOut.ComputeGPUPerfIndex()); } UE_LOG(LogSynthBenchmark, Display, TEXT("")); UE_LOG(LogSynthBenchmark, Display, TEXT(" ... Total Time: %f sec"), (float)(FPlatformTime::Seconds() - StartTime)); }