void RendererGPUBenchmark(FSynthBenchmarkResults& InOut, const FSceneView& View, uint32 WorkScale, bool bDebugOut)
{
	check(IsInRenderingThread());
	
	// two RT to ping pong so we force the GPU to flush it's pipeline
	TRefCountPtr<IPooledRenderTarget> RTItems[3];
	{
		FPooledRenderTargetDesc Desc(FPooledRenderTargetDesc::Create2DDesc(FIntPoint(GBenchmarkResolution, GBenchmarkResolution), PF_B8G8R8A8, TexCreate_None, TexCreate_RenderTargetable | TexCreate_ShaderResource, false));
		GRenderTargetPool.FindFreeElement(Desc, RTItems[0], TEXT("Benchmark0"));
		GRenderTargetPool.FindFreeElement(Desc, RTItems[1], TEXT("Benchmark1"));

		Desc.Extent = FIntPoint(1, 1);
		Desc.Flags = TexCreate_CPUReadback;	// needs TexCreate_ResolveTargetable?
		Desc.TargetableFlags = TexCreate_None;

		GRenderTargetPool.FindFreeElement(Desc, RTItems[2], TEXT("BenchmarkReadback"));
	}

	// set the state
	RHISetBlendState(TStaticBlendState<>::GetRHI());
	RHISetRasterizerState(TStaticRasterizerState<>::GetRHI());
	RHISetDepthStencilState(TStaticDepthStencilState<false,CF_Always>::GetRHI());

	{
		// larger number means more accuracy but slower, some slower GPUs might timeout with a number to large
		const uint32 IterationCount = 70;
		const uint32 MethodCount = ARRAY_COUNT(InOut.GPUStats);

		// 0 / 1
		uint32 DestRTIndex = 0;

		const uint32 TimerSampleCount = IterationCount * MethodCount + 1;

		static FRenderQueryRHIRef TimerQueries[TimerSampleCount];
		static uint32 PassCount[IterationCount];

		for(uint32  i = 0; i < TimerSampleCount; ++i)
		{
			TimerQueries[i] = GTimerQueryPool.AllocateQuery();
		}

		if(!TimerQueries[0])
		{
			UE_LOG(LogSynthBenchmark, Warning, TEXT("GPU driver does not support timer queries."));
		}

		// TimingValues are in Seconds per GPixel
		FTimingSeries TimingSeries[MethodCount];
		
		for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator)
		{
			TimingSeries[MethodIterator].Init(IterationCount);
		}

		check(MethodCount == 5);
		InOut.GPUStats[0] = FSynthBenchmarkStat(TEXT("ALUHeavyNoise"), 1.0f / 4.601f, TEXT("s/GigaPix"));
		InOut.GPUStats[1] = FSynthBenchmarkStat(TEXT("TexHeavy"), 1.0f / 7.447f, TEXT("s/GigaPix"));
		InOut.GPUStats[2] = FSynthBenchmarkStat(TEXT("DepTexHeavy"), 1.0f / 3.847f, TEXT("s/GigaPix"));
		InOut.GPUStats[3] = FSynthBenchmarkStat(TEXT("FillOnly"), 1.0f / 25.463f, TEXT("s/GigaPix"));
		InOut.GPUStats[4] = FSynthBenchmarkStat(TEXT("Bandwidth"), 1.0f / 1.072f, TEXT("s/GigaPix"));

		// e.g. on NV670: Method3 (mostly fill rate )-> 26GP/s (seems realistic)
		// reference: http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units theoretical: 29.3G/s

		RHIEndRenderQuery(TimerQueries[0]);

		// multiple iterations to see how trust able the values are
		for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration)
		{
			for(uint32 MethodIterator = 0; MethodIterator < MethodCount; ++MethodIterator)
			{
				// alternate between forward and backward (should give the same number)
				//			uint32 MethodId = (Iteration % 2) ? MethodIterator : (MethodCount - 1 - MethodIterator);
				uint32 MethodId = MethodIterator;

				uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId;

				// 0 / 1
				const uint32 SrcRTIndex = 1 - DestRTIndex;

				GRenderTargetPool.VisualizeTexture.SetCheckPoint(RTItems[DestRTIndex]);

				RHISetRenderTarget(RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, FTextureRHIRef());	

				// decide how much work we do in this pass
				PassCount[Iteration] = (Iteration / 10 + 1) * WorkScale;

				RunBenchmarkShader(View, MethodId, RTItems[SrcRTIndex], PassCount[Iteration]);

				RHICopyToResolveTarget(RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture, RTItems[DestRTIndex]->GetRenderTargetItem().ShaderResourceTexture, false, FResolveParams());

				/*if(bGPUCPUSync)
				{
					// more consistent timing but strangely much faster to the level that is unrealistic

					FResolveParams Param;

					Param.Rect = FResolveRect(0, 0, 1, 1);
					RHICopyToResolveTarget(
						RTItems[DestRTIndex]->GetRenderTargetItem().TargetableTexture,
						RTItems[2]->GetRenderTargetItem().ShaderResourceTexture,
						false,
						Param);

					void* Data = 0;
					int Width = 0;
					int Height = 0;

					RHIMapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture, Data, Width, Height);
					RHIUnmapStagingSurface(RTItems[2]->GetRenderTargetItem().ShaderResourceTexture);
				}*/

				RHIEndRenderQuery(TimerQueries[QueryIndex]);

				// ping pong
				DestRTIndex = 1 - DestRTIndex;
			}
		}

		{
			uint64 OldAbsTime = 0;
			RHIGetRenderQueryResult(TimerQueries[0], OldAbsTime, true);
			GTimerQueryPool.ReleaseQuery(TimerQueries[0]);

#if !UE_BUILD_SHIPPING
			FBenchmarkGraph BenchmarkGraph(IterationCount, IterationCount, *(FPaths::ScreenShotDir() + TEXT("GPUSynthBenchmarkGraph.bmp")));
#endif

			for(uint32 Iteration = 0; Iteration < IterationCount; ++Iteration)
			{
				uint32 Results[MethodCount];

				for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
				{
					uint32 QueryIndex = 1 + Iteration * MethodCount + MethodId;

					uint64 AbsTime;
					RHIGetRenderQueryResult(TimerQueries[QueryIndex], AbsTime, true);
					GTimerQueryPool.ReleaseQuery(TimerQueries[QueryIndex]);

					Results[MethodId] = AbsTime - OldAbsTime;
					OldAbsTime = AbsTime;
				}

				double SamplesInGPix = PassCount[Iteration] * GBenchmarkResolution * GBenchmarkResolution / 1000000000.0;

				for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
				{
					double TimeInSec = Results[MethodId] / 1000000.0;
					double TimingValue = TimeInSec / SamplesInGPix;

					// TimingValue in Seconds per GPixel
					TimingSeries[MethodId].SetEntry(Iteration, (float)TimingValue);
				}

#if !UE_BUILD_SHIPPING
				{
					// This is for debugging and we don't want to change the output but we still use "InOut".
					// That shouldn't hurt, as we override the values after that anyway.

					for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
					{
						InOut.GPUStats[MethodId].SetMeasuredTime(TimingSeries[MethodId].GetEntry(Iteration));
					}

					float LocalGPUIndex = InOut.ComputeGPUPerfIndex();

					// * 0.01 to get it in 0..1 range
					// * 0.5f to have 100 is the middle
					BenchmarkGraph.DrawBar(Iteration, LocalGPUIndex * 0.01f * 0.5f);
				}
#endif
			}

			for(uint32 MethodId = 0; MethodId < MethodCount; ++MethodId)
			{
				float Confidence = 0.0f;
				
				float TimingValue = TimingSeries[MethodId].ComputeValue(Confidence);

				if(Confidence > 0)
				{
					InOut.GPUStats[MethodId].SetMeasuredTime(TimingValue, Confidence);
				}

				UE_LOG(LogSynthBenchmark, Display, TEXT("         ... %.3f GigaPix/s, Confidence=%.0f%% '%s'"),
					1.0f / InOut.GPUStats[MethodId].GetMeasuredTime(), Confidence, InOut.GPUStats[MethodId].GetDesc());
			}

			UE_LOG(LogSynthBenchmark, Display, TEXT(""));
			
#if !UE_BUILD_SHIPPING
			if(bDebugOut)
			{
				BenchmarkGraph.Save();
			}
#endif
		}
	}
}
static void UpdatePlanarReflectionContents_RenderThread(
	FRHICommandListImmediate& RHICmdList, 
	FSceneRenderer* MainSceneRenderer, 
	FSceneRenderer* SceneRenderer, 
	const FPlanarReflectionSceneProxy* SceneProxy,
	FRenderTarget* RenderTarget, 
	FTexture* RenderTargetTexture, 
	const FPlane& MirrorPlane,
	const FName OwnerName, 
	const FResolveParams& ResolveParams, 
	bool bUseSceneColorTexture)
{
	QUICK_SCOPE_CYCLE_COUNTER(STAT_RenderPlanarReflection);

	FBox PlanarReflectionBounds = SceneProxy->WorldBounds;

	bool bIsInAnyFrustum = false;
	for (int32 ViewIndex = 0; ViewIndex < SceneRenderer->Views.Num(); ++ViewIndex)
	{
		FViewInfo& View = SceneRenderer->Views[ViewIndex];
		if (View.ViewFrustum.IntersectBox(PlanarReflectionBounds.GetCenter(), PlanarReflectionBounds.GetExtent()))
		{
			bIsInAnyFrustum = true;
			break;
		}
	}

	if (bIsInAnyFrustum)
	{
		bool bIsVisibleInAnyView = true;
		for (int32 ViewIndex = 0; ViewIndex < SceneRenderer->Views.Num(); ++ViewIndex)
		{
			FViewInfo& View = SceneRenderer->Views[ViewIndex];
			FSceneViewState* ViewState = View.ViewState;

			if (ViewState)
			{
				FIndividualOcclusionHistory& OcclusionHistory = ViewState->PlanarReflectionOcclusionHistories.FindOrAdd(SceneProxy->PlanarReflectionId);

				// +1 to buffered frames because the query is submitted late into the main frame, but read at the beginning of a reflection capture frame
				const int32 NumBufferedFrames = FOcclusionQueryHelpers::GetNumBufferedFrames() + 1;
				// +1 to frame counter because we are operating before the main view's InitViews, which is where OcclusionFrameCounter is incremented
				uint32 OcclusionFrameCounter = ViewState->OcclusionFrameCounter + 1;
				FRenderQueryRHIRef& PastQuery = OcclusionHistory.GetPastQuery(OcclusionFrameCounter, NumBufferedFrames);

				if (IsValidRef(PastQuery))
				{
					uint64 NumSamples = 0;
					QUICK_SCOPE_CYCLE_COUNTER(STAT_PlanarReflectionOcclusionQueryResults);

					if (RHIGetRenderQueryResult(PastQuery.GetReference(), NumSamples, true))
					{
						bIsVisibleInAnyView = NumSamples > 0;
						if (bIsVisibleInAnyView)
						{
							break;
						}
					}
				}
			}
		}

		if (bIsVisibleInAnyView)
		{
			FMemMark MemStackMark(FMemStack::Get());

			// update any resources that needed a deferred update
			FDeferredUpdateResource::UpdateResources(RHICmdList);

			{
#if WANTS_DRAW_MESH_EVENTS
				FString EventName;
				OwnerName.ToString(EventName);
				SCOPED_DRAW_EVENTF(RHICmdList, SceneCapture, TEXT("PlanarReflection %s"), *EventName);
#else
				SCOPED_DRAW_EVENT(RHICmdList, UpdatePlanarReflectionContent_RenderThread);
#endif

				const FRenderTarget* Target = SceneRenderer->ViewFamily.RenderTarget;
				SetRenderTarget(RHICmdList, Target->GetRenderTargetTexture(), NULL, true);

				// Note: relying on GBuffer SceneColor alpha being cleared to 1 in the main scene rendering
				check(GetSceneColorClearAlpha() == 1.0f);
				RHICmdList.Clear(true, FLinearColor(0, 0, 0, 1), false, (float)ERHIZBuffer::FarPlane, false, 0, FIntRect());

				// Reflection view late update
				if (SceneRenderer->Views.Num() > 1)
				{
					const FMirrorMatrix MirrorMatrix(MirrorPlane);
					for (int32 ViewIndex = 0; ViewIndex < SceneRenderer->Views.Num(); ++ViewIndex)
					{
						FViewInfo& ReflectionViewToUpdate = SceneRenderer->Views[ViewIndex];
						const FViewInfo& UpdatedParentView = MainSceneRenderer->Views[ViewIndex];

						ReflectionViewToUpdate.UpdatePlanarReflectionViewMatrix(UpdatedParentView, MirrorMatrix);
					}
				}

				// Render the scene normally
				{
					SCOPED_DRAW_EVENT(RHICmdList, RenderScene);
					SceneRenderer->Render(RHICmdList);
				}

				for (int32 ViewIndex = 0; ViewIndex < SceneRenderer->Views.Num(); ++ViewIndex)
				{
					FViewInfo& View = SceneRenderer->Views[ViewIndex];
					if (MainSceneRenderer->Scene->GetShadingPath() == EShadingPath::Deferred)
					{
						PrefilterPlanarReflection<true>(RHICmdList, View, SceneProxy, Target);
					}
					else
					{
						PrefilterPlanarReflection<false>(RHICmdList, View, SceneProxy, Target);
					}
				}
				RHICmdList.CopyToResolveTarget(RenderTarget->GetRenderTargetTexture(), RenderTargetTexture->TextureRHI, false, ResolveParams);
			}
			FSceneRenderer::WaitForTasksClearSnapshotsAndDeleteSceneRenderer(RHICmdList, SceneRenderer);
		}
	}
}