//-----------------------------------------------------------------------------
/// Wrapper for vkQueueBindSparse. Forwards the call through the device dispatch
/// table and, when API tracing is active, records the call and its result.
/// \param queue The queue the sparse binding is submitted to.
/// \param bindInfoCount The number of elements in pBindInfo.
/// \param pBindInfo The array of sparse binding descriptions.
/// \param fence The fence handle passed through to the real call.
/// \returns The result code returned by the real QueueBindSparse call.
//-----------------------------------------------------------------------------
VkResult VktWrappedQueue::QueueBindSparse(VkQueue queue, uint32_t bindInfoCount, const VkBindSparseInfo* pBindInfo, VkFence fence)
{
    const FuncId funcId = FuncId_vkQueueBindSparse;

    VkResult result = VK_INCOMPLETE;

    if (m_createInfo.pInterceptMgr->ShouldCollectTrace())
    {
        char argumentsBuffer[ARGUMENTS_BUFFER_SIZE];

        // PrintArrayWithFormatter yields text, so it has to be emitted with %s.
        // Printing it with %p would log the address of the temporary string
        // buffer rather than the formatted array contents.
        sprintf_s(argumentsBuffer, ARGUMENTS_BUFFER_SIZE, "0x%p, %u, %s, 0x%p", queue, bindInfoCount, PrintArrayWithFormatter(bindInfoCount, pBindInfo, "0x%p").c_str(), fence);

        // Bracket the real call with Pre/PostCall so the trace entry carries the result.
        VktAPIEntry* pNewEntry = m_createInfo.pInterceptMgr->PreCall(funcId, argumentsBuffer);
        result = device_dispatch_table(queue)->QueueBindSparse(queue, bindInfoCount, pBindInfo, fence);
        m_createInfo.pInterceptMgr->PostCall(pNewEntry, result);
    }
    else
    {
        // Not tracing: just forward the call.
        result = device_dispatch_table(queue)->QueueBindSparse(queue, bindInfoCount, pBindInfo, fence);
    }

    return result;
}
/* Resolve a device-level entry point by name.
 * Names implemented by this layer are answered directly; anything else is
 * forwarded to the GetDeviceProcAddr stored in the device's dispatch table.
 * Returns NULL when there is no device or no downstream resolver. */
VK_LAYER_EXPORT VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetDeviceProcAddr(VkDevice device, const char *pName)
{
    /* Entry points this layer provides itself. */
    if (strcmp(pName, "vkGetDeviceProcAddr") == 0)
    {
        return reinterpret_cast<PFN_vkVoidFunction>(vkGetDeviceProcAddr);
    }

    if (strcmp(pName, "vkDestroyDevice") == 0)
    {
        return reinterpret_cast<PFN_vkVoidFunction>(basic_DestroyDevice);
    }

    if (strcmp(pName, "vkLayerBasicEXT") == 0)
    {
        return reinterpret_cast<PFN_vkVoidFunction>(vkLayerBasicEXT);
    }

    /* Short-circuit keeps the dispatch table lookup from running on a null device. */
    if ((device == nullptr) || (device_dispatch_table(device)->GetDeviceProcAddr == nullptr))
    {
        return nullptr;
    }

    /* Everything else is resolved further down the chain. */
    return device_dispatch_table(device)->GetDeviceProcAddr(device, pName);
}
//----------------------------------------------------------------------------- /// Perform all profiler initialization. /// \param config A pointer to a profiler configuration structure. /// \returns The result code for initialization. //----------------------------------------------------------------------------- VkResult VktCmdBufProfiler::Init(const VktCmdBufProfilerConfig& config) ///< [in] Pointer to profiler configuration { VkResult result = VK_INCOMPLETE; if ((config.physicalDevice != VK_NULL_HANDLE) && (config.device != VK_NULL_HANDLE)) { memcpy(&m_config, &config, sizeof(m_config)); m_pInstanceDT = instance_dispatch_table(config.physicalDevice); m_pDeviceDT = device_dispatch_table(config.device); m_pInstanceDT->GetPhysicalDeviceMemoryProperties(config.physicalDevice, &m_memProps); m_pInstanceDT->GetPhysicalDeviceProperties(config.physicalDevice, &m_physicalDeviceProps); m_gpuTimestampFreq = 1000000000.0f / m_physicalDeviceProps.limits.timestampPeriod; m_maxQueriesPerGroup = m_config.measurementsPerGroup * ProfilerTimestampsPerMeasurement; ClearCmdBufData(); result = VK_SUCCESS; } return result; }
//----------------------------------------------------------------------------- /// Kill all info retained by this thread. //----------------------------------------------------------------------------- void VktWrappedQueue::EndCollection() { ScopeLock lock(&m_workerThreadInfoMutex); for (UINT i = 0; i < m_workerThreadInfo.size(); i++) { // Delete profiler memory for (UINT j = 0; j < m_workerThreadInfo[i]->m_inputs.cmdBufs.size(); j++) { VktWrappedCmdBuf* pCmdBuf = m_workerThreadInfo[i]->m_inputs.cmdBufs[j]; if (pCmdBuf != nullptr) { pCmdBuf->DestroyDynamicProfilers(); } } // Free the fence we created earlier if (m_workerThreadInfo[i]->m_inputs.internalFence) { device_dispatch_table(m_createInfo.device)->DestroyFence(m_createInfo.device, m_workerThreadInfo[i]->m_inputs.fenceToWaitOn, nullptr); } m_workerThreadInfo[i]->m_outputs.results.clear(); CloseHandle(m_workerThreadInfo[i]->m_threadInfo.threadHandle); SAFE_DELETE(m_workerThreadInfo[i]); } m_workerThreadInfo.clear(); }
//----------------------------------------------------------------------------- /// Profiler results collection worker function. /// \param lpParam A void pointer to the incoming VktWorkerInfo argument. /// \returns Always 0. //----------------------------------------------------------------------------- DWORD WINAPI ThreadFunc(LPVOID lpParam) { VktWorkerInfo* pWorkerInfo = (VktWorkerInfo*)lpParam; pWorkerInfo->m_threadInfo.workerThreadID = osGetCurrentThreadId(); VkResult waitResult = VK_TIMEOUT; #if GPU_FENCES_FOR_PROFILER_WAIT VkDevice device = pWorkerInfo->m_inputs.pQueue->ParentDevice(); do { waitResult = device_dispatch_table(device)->WaitForFences(device, 1, &pWorkerInfo->m_inputs.fenceToWaitOn, VK_TRUE, GPU_FENCE_TIMEOUT_TIME); } while (waitResult == VK_TIMEOUT); #else VkQueue queue = pWorkerInfo->m_inputs.pQueue->AppHandle(); waitResult = device_dispatch_table(queue)->QueueWaitIdle(queue); #endif if (pWorkerInfo->m_inputs.timestampPair.mQueueCanBeTimestamped) { for (UINT i = 0; i < pWorkerInfo->m_inputs.cmdBufs.size(); i++) { VktWrappedCmdBuf* pWrappedCmdBuf = pWorkerInfo->m_inputs.cmdBufs[i]; ProfilerResultCode profResult = pWrappedCmdBuf->GetCmdBufResultsMT(pWorkerInfo->m_inputs.executionID, pWorkerInfo->m_outputs.results); if (profResult != PROFILER_SUCCESS) { const char* profilerErrorCode = VktCmdBufProfiler::PrintProfilerResult(profResult); // Report that a problem occurred in retrieving full profiler results. Log(logERROR, "Failed to retrieve full profiler results: CmdBuf 0x%p, Queue 0x%p, ErrorCode %s\n", pWorkerInfo->m_inputs.cmdBufs[i], pWorkerInfo->m_inputs.pQueue, profilerErrorCode); } } } // This will only be set to true if the GPU results have come back in time. pWorkerInfo->m_outputs.bResultsGathered = true; return 0; }
//-----------------------------------------------------------------------------
/// Wrapper for vkQueueWaitIdle. Forwards the call through the device dispatch
/// table and, when API tracing is active, records the call and its result.
/// \param queue The queue to wait on.
/// \returns The result code returned by the real QueueWaitIdle call.
//-----------------------------------------------------------------------------
VkResult VktWrappedQueue::QueueWaitIdle(VkQueue queue)
{
    // Fast path: tracing is off, so there is nothing to record.
    if (!m_createInfo.pInterceptMgr->ShouldCollectTrace())
    {
        return device_dispatch_table(queue)->QueueWaitIdle(queue);
    }

    // Stringify the arguments for the trace entry.
    char argumentsBuffer[ARGUMENTS_BUFFER_SIZE];
    sprintf_s(argumentsBuffer, ARGUMENTS_BUFFER_SIZE, "0x%p", queue);

    // Bracket the real call with Pre/PostCall so the trace entry carries the result.
    VktAPIEntry* pTraceEntry = m_createInfo.pInterceptMgr->PreCall(FuncId_vkQueueWaitIdle, argumentsBuffer);
    const VkResult waitResult = device_dispatch_table(queue)->QueueWaitIdle(queue);
    m_createInfo.pInterceptMgr->PostCall(pTraceEntry, waitResult);

    return waitResult;
}
//----------------------------------------------------------------------------- /// Profiler results collection worker function. /// \param lpParam A void pointer to the incoming VktWorkerInfo argument. /// \returns Always 0. //----------------------------------------------------------------------------- DWORD WINAPI ThreadFunc(LPVOID lpParam) { VktWorkerInfo* pWorkerInfo = (VktWorkerInfo*)lpParam; pWorkerInfo->m_threadInfo.workerThreadID = osGetCurrentThreadId(); VkResult waitResult = VK_TIMEOUT; #if GPU_FENCES_FOR_PROFILER_WAIT VkDevice device = pWorkerInfo->m_inputs.pQueue->ParentDevice(); do { waitResult = device_dispatch_table(device)->WaitForFences(device, 1, &pWorkerInfo->m_inputs.fenceToWaitOn, VK_TRUE, GPU_FENCE_TIMEOUT_TIME); } while (waitResult == VK_TIMEOUT); #else VkQueue queue = pWorkerInfo->m_inputs.pQueue->AppHandle(); waitResult = device_dispatch_table(queue)->QueueWaitIdle(queue); #endif for (UINT i = 0; i < pWorkerInfo->m_inputs.cmdBufData.size(); i++) { VktWrappedCmdBuf* pWrappedCmdBuf = pWorkerInfo->m_inputs.cmdBufData[i].pCmdBuf; UINT64 targetFillId = pWorkerInfo->m_inputs.cmdBufData[i].targetFillID; UINT profiledCallCount = pWorkerInfo->m_inputs.cmdBufData[i].profiledCallCount; ProfilerResultCode profResult = pWrappedCmdBuf->GetCmdBufResultsMT(targetFillId, profiledCallCount, pWorkerInfo->m_outputs.results); if (profResult != PROFILER_SUCCESS) { const char* profilerErrorCode = VktCmdBufProfiler::PrintProfilerResult(profResult); // Report that a problem occurred in retrieving full profiler results. Log(logERROR, "Failed to retrieve full profiler results: CmdBuf 0x%p, Queue 0x%p, ErrorCode %s\n", pWorkerInfo->m_inputs.cmdBufData[i].pCmdBuf, pWorkerInfo->m_inputs.pQueue, profilerErrorCode); } } return 0; }
//-----------------------------------------------------------------------------
/// Call QueuePresentKHR directly through the device dispatch table, with no
/// tracing or profiling performed in this function.
/// \param queue The queue to present on.
/// \param pPresentInfo The present parameters passed through unchanged.
/// \returns The result code returned by the dispatched QueuePresentKHR call.
//-----------------------------------------------------------------------------
VkResult VktWrappedQueue::QueuePresentKHR_ICD(VkQueue queue, const VkPresentInfoKHR* pPresentInfo)
{
    const VkResult presentResult = device_dispatch_table(queue)->QueuePresentKHR(queue, pPresentInfo);

    return presentResult;
}
//----------------------------------------------------------------------------- /// Submit command buffers and gather results. /// \param queue The queue issued work to. /// \param submitCount The number of submits. /// \param pSubmits The submit info structures. /// \param fence The fence wrapping this submit. //----------------------------------------------------------------------------- VkResult VktWrappedQueue::QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence) { m_executionID++; VkResult result = VK_INCOMPLETE; VktTraceAnalyzerLayer* pTraceAnalyzer = VktTraceAnalyzerLayer::Instance(); VktFrameProfilerLayer* pFrameProfiler = VktFrameProfilerLayer::Instance(); // Use this calibration timestamp structure to convert GPU events to the CPU timeline. CalibrationTimestampPair calibrationTimestamps = {}; calibrationTimestamps.mQueueCanBeTimestamped = true; VkFence fenceToWaitOn = fence; bool usingInternalFence = false; std::vector<VktWrappedCmdBuf*> wrappedCmdBufs; GatherWrappedCommandBufs(submitCount, pSubmits, wrappedCmdBufs); for (UINT i = 0; i < wrappedCmdBufs.size(); i++) { wrappedCmdBufs[i]->SetProfilerExecutionId(m_executionID); wrappedCmdBufs[i]->IncrementSubmitCount(); } // Surround the execution of CommandBuffers with timestamps so we can determine when the GPU work occurred in the CPU timeline. if (pTraceAnalyzer->ShouldCollectTrace() && pFrameProfiler->ShouldCollectGPUTime()) { // Collect calibration timestamps in case we need to align GPU events against the CPU timeline. 
if (calibrationTimestamps.mQueueCanBeTimestamped) { pFrameProfiler->CollectCalibrationTimestamps(this, &calibrationTimestamps); } else { Log(logTRACE, "Did not collect calibration timestamps for Queue '0x%p'\n", this); } // Inject our own fence if the app did not supply one if (fenceToWaitOn == VK_NULL_HANDLE) { // Create internal fence VkFenceCreateInfo fenceCreateInfo = {}; VkResult fenceResult = VK_INCOMPLETE; fenceResult = device_dispatch_table(queue)->CreateFence(m_createInfo.device, &fenceCreateInfo, nullptr, &fenceToWaitOn); VKT_ASSERT(fenceResult == VK_SUCCESS); usingInternalFence = true; } } // Invoke the real call to execute on the GPU result = QueueSubmit_ICD(queue, submitCount, pSubmits, fenceToWaitOn); if (pTraceAnalyzer->ShouldCollectTrace() && pFrameProfiler->ShouldCollectGPUTime()) { // Collect the CPU and GPU frequency to convert timestamps. QueryPerformanceFrequency(&calibrationTimestamps.cpuFrequency); #if GATHER_PROFILER_RESULTS_WITH_WORKERS SpawnWorker(&calibrationTimestamps, this, fenceToWaitOn, usingInternalFence, wrappedCmdBufs); #else VkResult waitResult = VK_TIMEOUT; #if GPU_FENCES_FOR_PROFILER_WAIT do { waitResult = device_dispatch_table(m_createInfo.device)->WaitForFences(m_createInfo.device, 1, &fenceToWaitOn, VK_TRUE, GPU_FENCE_TIMEOUT_TIME); } while (waitResult == VK_TIMEOUT); #else waitResult = device_dispatch_table(queue)->QueueWaitIdle(queue); #endif if (calibrationTimestamps.mQueueCanBeTimestamped) { // Put all results into thread ID 0 bucket const UINT32 threadID = 0; std::vector<ProfilerResult> results; for (UINT i = 0; i < wrappedCmdBufs.size(); i++) { ProfilerResultCode getResultsResult = PROFILER_FAIL; getResultsResult = wrappedCmdBufs[i]->GetCmdBufResultsST(results); VKT_ASSERT(getResultsResult != PROFILER_FAIL); } pFrameProfiler->VerifyAlignAndStoreResults(this, results, &calibrationTimestamps, threadID, VktTraceAnalyzerLayer::Instance()->GetFrameStartTime()); // Free the fence we created earlier if (usingInternalFence) { 
device_dispatch_table(m_createInfo.device)->DestroyFence(m_createInfo.device, fenceToWaitOn, nullptr); } } else { Log(logTRACE, "Didn't collect calibration timestamps for Queue '0x%p'.\n", this); } #endif } #if GATHER_PROFILER_RESULTS_WITH_WORKERS == 0 for (UINT i = 0; i < wrappedCmdBufs.size(); i++) { wrappedCmdBufs[i]->DestroyDynamicProfilers(); } #endif return result; }
//----------------------------------------------------------------------------- /// Collect and store calibration timestamps from the CPU and GPU to align execution results in a single timeline. /// \param pWrappedQueue The Queue responsible for work submission. /// \param pTimestamps The timestamps structure used to hold timestamps occurring before and after workload execution. //----------------------------------------------------------------------------- VkResult VktFrameProfilerLayer::CollectCalibrationTimestamps(VktWrappedQueue* pWrappedQueue, CalibrationTimestampPair* pTimestamps) { VkResult result = VK_INCOMPLETE; #if MANUAL_TIMESTAMP_CALIBRATION if ((pWrappedQueue != nullptr) && (pTimestamps != nullptr)) { VkQueue queue = pWrappedQueue->AppHandle(); VkDevice device = pWrappedQueue->ParentDevice(); TimestampedCmdBufConfig config = {}; config.device = device; config.physicalDevice = pWrappedQueue->PhysicalDevice(); config.mapTimestampMem = false; config.pipelineLoc = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; config.queueFamilyIndex = pWrappedQueue->GetQueueFamilyIndex(); VktTimestampedCmdBuf* pTimestampedCmdBuf = VktTimestampedCmdBuf::Create(config); if (pTimestampedCmdBuf != nullptr) { const VkCommandBuffer cmdBufs[] = { pTimestampedCmdBuf->CmdBufHandle() }; VkSubmitInfo submitInfo = {}; submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submitInfo.pNext = nullptr; submitInfo.waitSemaphoreCount = 0; submitInfo.pWaitSemaphores = nullptr; submitInfo.pWaitDstStageMask = nullptr; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = cmdBufs; submitInfo.signalSemaphoreCount = 0; submitInfo.pSignalSemaphores = nullptr; VkFence fence = VK_NULL_HANDLE; VkFenceCreateInfo fenceCreateInfo = {}; result = device_dispatch_table(queue)->CreateFence(device, &fenceCreateInfo, nullptr, &fence); if (result == VK_SUCCESS) { LARGE_INTEGER largeInt = {}; result = pWrappedQueue->QueueSubmit_ICD(queue, 1, &submitInfo, fence); VKT_ASSERT(result == VK_SUCCESS); VkResult 
waitResult = VK_TIMEOUT; do { waitResult = device_dispatch_table(device)->WaitForFences(device, 1, &fence, VK_TRUE, GPU_FENCE_TIMEOUT_TIME); } while (waitResult == VK_TIMEOUT); // Fetch the GPU counter pTimestampedCmdBuf->GetTimestampResult(&pTimestamps->mBeforeExecutionGPUTimestamp); #ifdef WIN32 // Immediately after, fetch the CPU counter QueryPerformanceCounter(&largeInt); pTimestamps->mBeforeExecutionCPUTimestamp = largeInt.QuadPart; #endif pTimestamps->mQueueFrequency = (UINT64)pWrappedQueue->GetTimestampFrequency(); device_dispatch_table(device)->DestroyFence(device, fence, nullptr); } delete pTimestampedCmdBuf; pTimestampedCmdBuf = nullptr; } } #else UNREFERENCED_PARAMETER(pWrappedQueue); UNREFERENCED_PARAMETER(pTimestamps); #endif return result; }
/* Hook for vkDestroyDevice: forwards the destroy call through the device's
 * dispatch table and then removes that device's dispatch table entry. */
VK_LAYER_EXPORT VKAPI_ATTR void VKAPI_CALL basic_DestroyDevice(VkDevice device, const VkAllocationCallbacks *pAllocator)
{
    /* Fetch the key before the device is destroyed; it is needed afterwards
     * to drop the table entry. */
    const dispatch_key tableKey = get_dispatch_key(device);

    device_dispatch_table(device)->DestroyDevice(device, pAllocator);

    destroy_device_dispatch_table(tableKey);
}