//----------------------------------------------------------------------------- /// Begin profiling a GPU command. /// \param pIdInfo The identifying metadata for the new measurement. /// \returns A profiler result code indicating measurement success. //----------------------------------------------------------------------------- ProfilerResultCode VktCmdBufProfiler::BeginCmdMeasurement(const ProfilerMeasurementId* pIdInfo) { ProfilerResultCode profilerResultCode = PROFILER_FAIL; if (m_cmdBufData.state != PROFILER_STATE_MEASUREMENT_BEGAN) { const UINT measurementId = m_cmdBufData.cmdBufMeasurementCount % m_config.measurementsPerGroup; // Create new measurement group if full if (measurementId == 0) { VkResult result = VK_INCOMPLETE; result = SetupNewMeasurementGroup(); VKT_ASSERT(result == VK_SUCCESS); if (result != VK_SUCCESS) { Log(logERROR, "SetupNewMeasurementGroup() failed in VktCmdBufProfiler::BeginCmdMeasurement()\n"); } m_pDeviceDT->CmdResetQueryPool(m_config.cmdBuf, m_cmdBufData.pActiveMeasurementGroup->gpuRes.timestampQueryPool, 0, m_maxQueriesPerGroup); } if (m_config.measurementTypeFlags & PROFILER_MEASUREMENT_TYPE_TIMESTAMPS) { const UINT offset = measurementId * ProfilerTimestampsPerMeasurement; // Inject timestamp m_pDeviceDT->CmdWriteTimestamp( m_config.cmdBuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_cmdBufData.pActiveMeasurementGroup->gpuRes.timestampQueryPool, offset); // Inject timestamp m_pDeviceDT->CmdWriteTimestamp( m_config.cmdBuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_cmdBufData.pActiveMeasurementGroup->gpuRes.timestampQueryPool, offset + 1); } m_cmdBufData.cmdBufMeasurementCount++; // Add a new measurement ProfilerMeasurementInfo clientData = ProfilerMeasurementInfo(); clientData.measurementNum = m_cmdBufData.cmdBufMeasurementCount; if (pIdInfo != nullptr) { memcpy(&clientData.idInfo, pIdInfo, sizeof(ProfilerMeasurementId)); } m_cmdBufData.pActiveMeasurementGroup->measurementInfos.push_back(clientData); 
m_cmdBufData.pActiveMeasurementGroup->groupMeasurementCount++; m_cmdBufData.state = PROFILER_STATE_MEASUREMENT_BEGAN; profilerResultCode = PROFILER_SUCCESS; } else { profilerResultCode = PROFILER_ERROR_MEASUREMENT_ALREADY_BEGAN; VKT_ASSERT_ALWAYS(); } return profilerResultCode; }
//----------------------------------------------------------------------------- /// Notify the profiler that this command buffer was manually reset. //----------------------------------------------------------------------------- void VktCmdBufProfiler::NotifyCmdBufReset() { VKT_ASSERT((m_cmdBufData.state == PROFILER_STATE_INIT) || (m_cmdBufData.state == PROFILER_STATE_CMD_BUF_CLOSED)); }
//----------------------------------------------------------------------------- /// Submit command buffers and gather results. /// \param queue The queue issued work to. /// \param submitCount The number of submits. /// \param pSubmits The submit info structures. /// \param fence The fence wrapping this submit. //----------------------------------------------------------------------------- VkResult VktWrappedQueue::QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence) { m_executionID++; VkResult result = VK_INCOMPLETE; VktTraceAnalyzerLayer* pTraceAnalyzer = VktTraceAnalyzerLayer::Instance(); VktFrameProfilerLayer* pFrameProfiler = VktFrameProfilerLayer::Instance(); // Use this calibration timestamp structure to convert GPU events to the CPU timeline. CalibrationTimestampPair calibrationTimestamps = {}; calibrationTimestamps.mQueueCanBeTimestamped = true; VkFence fenceToWaitOn = fence; bool usingInternalFence = false; std::vector<VktWrappedCmdBuf*> wrappedCmdBufs; GatherWrappedCommandBufs(submitCount, pSubmits, wrappedCmdBufs); for (UINT i = 0; i < wrappedCmdBufs.size(); i++) { wrappedCmdBufs[i]->SetProfilerExecutionId(m_executionID); wrappedCmdBufs[i]->IncrementSubmitCount(); } // Surround the execution of CommandBuffers with timestamps so we can determine when the GPU work occurred in the CPU timeline. if (pTraceAnalyzer->ShouldCollectTrace() && pFrameProfiler->ShouldCollectGPUTime()) { // Collect calibration timestamps in case we need to align GPU events against the CPU timeline. 
if (calibrationTimestamps.mQueueCanBeTimestamped) { pFrameProfiler->CollectCalibrationTimestamps(this, &calibrationTimestamps); } else { Log(logTRACE, "Did not collect calibration timestamps for Queue '0x%p'\n", this); } // Inject our own fence if the app did not supply one if (fenceToWaitOn == VK_NULL_HANDLE) { // Create internal fence VkFenceCreateInfo fenceCreateInfo = {}; VkResult fenceResult = VK_INCOMPLETE; fenceResult = device_dispatch_table(queue)->CreateFence(m_createInfo.device, &fenceCreateInfo, nullptr, &fenceToWaitOn); VKT_ASSERT(fenceResult == VK_SUCCESS); usingInternalFence = true; } } // Invoke the real call to execute on the GPU result = QueueSubmit_ICD(queue, submitCount, pSubmits, fenceToWaitOn); if (pTraceAnalyzer->ShouldCollectTrace() && pFrameProfiler->ShouldCollectGPUTime()) { // Collect the CPU and GPU frequency to convert timestamps. QueryPerformanceFrequency(&calibrationTimestamps.cpuFrequency); #if GATHER_PROFILER_RESULTS_WITH_WORKERS SpawnWorker(&calibrationTimestamps, this, fenceToWaitOn, usingInternalFence, wrappedCmdBufs); #else VkResult waitResult = VK_TIMEOUT; #if GPU_FENCES_FOR_PROFILER_WAIT do { waitResult = device_dispatch_table(m_createInfo.device)->WaitForFences(m_createInfo.device, 1, &fenceToWaitOn, VK_TRUE, GPU_FENCE_TIMEOUT_TIME); } while (waitResult == VK_TIMEOUT); #else waitResult = device_dispatch_table(queue)->QueueWaitIdle(queue); #endif if (calibrationTimestamps.mQueueCanBeTimestamped) { // Put all results into thread ID 0 bucket const UINT32 threadID = 0; std::vector<ProfilerResult> results; for (UINT i = 0; i < wrappedCmdBufs.size(); i++) { ProfilerResultCode getResultsResult = PROFILER_FAIL; getResultsResult = wrappedCmdBufs[i]->GetCmdBufResultsST(results); VKT_ASSERT(getResultsResult != PROFILER_FAIL); } pFrameProfiler->VerifyAlignAndStoreResults(this, results, &calibrationTimestamps, threadID, VktTraceAnalyzerLayer::Instance()->GetFrameStartTime()); // Free the fence we created earlier if (usingInternalFence) { 
device_dispatch_table(m_createInfo.device)->DestroyFence(m_createInfo.device, fenceToWaitOn, nullptr); } } else { Log(logTRACE, "Didn't collect calibration timestamps for Queue '0x%p'.\n", this); } #endif } #if GATHER_PROFILER_RESULTS_WITH_WORKERS == 0 for (UINT i = 0; i < wrappedCmdBufs.size(); i++) { wrappedCmdBufs[i]->DestroyDynamicProfilers(); } #endif return result; }
//-----------------------------------------------------------------------------
/// We assume this will be called immediately after a command buffer has been submitted.
/// Reads back timestamp data for every measurement group recorded in this
/// command buffer, converts it into ProfilerResult entries, and resets the
/// profiler so the command buffer can be profiled again.
/// \param results A vector containing performance information for a given function.
/// \returns A code with the result of collecting profiler results for the CommandBuffer:
///          PROFILER_THIS_CMD_BUF_WAS_NOT_CLOSED if the buffer wasn't closed,
///          PROFILER_THIS_CMD_BUF_WAS_NOT_MEASURED if nothing was measured,
///          PROFILER_SUCCESS otherwise.
//-----------------------------------------------------------------------------
ProfilerResultCode VktCmdBufProfiler::GetCmdBufResults(std::vector<ProfilerResult>& results)
{
    // Guard against concurrent access to m_cmdBufData.
    ScopeLock lock(&m_mutex);

    ProfilerResultCode profilerResultCode = PROFILER_THIS_CMD_BUF_WAS_NOT_CLOSED;

    VkResult result = VK_INCOMPLETE;

    // Results can only be fetched once the command buffer was closed.
    if (m_cmdBufData.state == PROFILER_STATE_CMD_BUF_CLOSED)
    {
        profilerResultCode = PROFILER_THIS_CMD_BUF_WAS_NOT_MEASURED;

        // Loop through all measurements for this command buffer
        for (UINT i = 0; i < m_cmdBufData.measurementGroups.size(); i++)
        {
            ProfilerMeasurementGroup& currGroup = m_cmdBufData.measurementGroups[i];

            // Raw GPU timestamp intervals for this group; either mapped from
            // device memory or copied out via vkGetQueryPoolResults below.
            ProfilerInterval* pTimestampData = nullptr;

            if (m_config.measurementTypeFlags & PROFILER_MEASUREMENT_TYPE_TIMESTAMPS)
            {
                // We use vkCmdCopyQueryPoolResults: results already live in a
                // buffer, so just map its backing memory.
                if (m_config.mapTimestampMem == true)
                {
                    result = m_pDeviceDT->MapMemory(
                        m_config.device,
                        currGroup.gpuRes.timestampMem,
                        0,
                        VK_WHOLE_SIZE,
                        0,
                        (void**)&pTimestampData);
                }
                // We use vkGetQueryPoolResults: allocate a host buffer and
                // read the queries back directly, waiting for availability.
                else
                {
                    pTimestampData = new ProfilerInterval[currGroup.groupMeasurementCount]();

                    result = m_pDeviceDT->GetQueryPoolResults(
                        m_config.device,
                        currGroup.gpuRes.timestampQueryPool,
                        0,
                        currGroup.groupMeasurementCount * ProfilerTimestampsPerMeasurement,
                        currGroup.groupMeasurementCount * sizeof(ProfilerInterval),
                        pTimestampData,
                        sizeof(UINT64),
                        VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
                }
            }

            // Report no results
            if (m_config.measurementTypeFlags == PROFILER_MEASUREMENT_TYPE_NONE)
            {
                // Emit one empty result per measurement so callers still see
                // the expected count.
                for (UINT j = 0; j < currGroup.groupMeasurementCount; j++)
                {
                    ProfilerResult profilerResult = ProfilerResult();
                    results.push_back(profilerResult);
                }
            }
            // Fetch our results
            else
            {
                profilerResultCode = PROFILER_SUCCESS;

                for (UINT j = 0; j < currGroup.groupMeasurementCount; j++)
                {
                    ProfilerResult profilerResult = ProfilerResult();

                    // Copy the client-supplied id info recorded at Begin time.
                    memcpy(&profilerResult.measurementInfo, &currGroup.measurementInfos[j], sizeof(ProfilerMeasurementInfo));

                    if (m_config.measurementTypeFlags & PROFILER_MEASUREMENT_TYPE_TIMESTAMPS)
                    {
                        UINT64* pTimerPreBegin = &pTimestampData[j].preStart;
                        UINT64* pTimerBegin = &pTimestampData[j].start;
                        UINT64* pTimerEnd = &pTimestampData[j].end;

                        // All adjusted clocks are relative to the first
                        // measurement's start time in this group.
                        UINT64 baseClock = pTimestampData[0].start;

                        // Store raw clocks
                        profilerResult.timestampResult.rawClocks.preStart = *pTimerPreBegin;
                        profilerResult.timestampResult.rawClocks.start = *pTimerBegin;
                        profilerResult.timestampResult.rawClocks.end = *pTimerEnd;

                        // Calculate adjusted clocks
                        profilerResult.timestampResult.adjustedClocks.preStart = 0;
                        profilerResult.timestampResult.adjustedClocks.start = *pTimerBegin - baseClock;
                        profilerResult.timestampResult.adjustedClocks.end = *pTimerEnd - baseClock;

                        // Calculate exec time: clocks / frequency gives
                        // seconds, then scale to microseconds.
                        profilerResult.timestampResult.execMicroSecs = static_cast<double>(*pTimerEnd - *pTimerBegin) / m_gpuTimestampFreq;
                        profilerResult.timestampResult.execMicroSecs *= 1000000;

                        // Detected a zero timestamp. Allow this and continue, but some results are invalid.
                        VKT_ASSERT((*pTimerPreBegin != 0ULL) && (*pTimerBegin != 0ULL) && (*pTimerEnd != 0ULL));
                    }

                    results.push_back(profilerResult);
                }
            }

            // Release the timestamp storage for this group: unmap if we
            // mapped device memory, otherwise free the host-side copy.
            if (pTimestampData != nullptr)
            {
                if (m_config.mapTimestampMem == true)
                {
                    m_pDeviceDT->UnmapMemory(m_config.device, currGroup.gpuRes.timestampMem);
                }
                else
                {
                    delete [] pTimestampData;
                    pTimestampData = nullptr;
                }
            }
        }
    }

    // We're done profiling this command buffer, so reset and we'll start over next time.
    result = ResetProfilerState();
    VKT_ASSERT(result == VK_SUCCESS);

    if (result != VK_SUCCESS)
    {
        Log(logERROR, "ResetProfilerState() failed in VktCmdBufProfiler::GetCmdBufResults()\n");
    }

    return profilerResultCode;
}
//----------------------------------------------------------------------------- /// Collect and store calibration timestamps from the CPU and GPU to align execution results in a single timeline. /// \param pWrappedQueue The Queue responsible for work submission. /// \param pTimestamps The timestamps structure used to hold timestamps occurring before and after workload execution. //----------------------------------------------------------------------------- VkResult VktFrameProfilerLayer::CollectCalibrationTimestamps(VktWrappedQueue* pWrappedQueue, CalibrationTimestampPair* pTimestamps) { VkResult result = VK_INCOMPLETE; #if MANUAL_TIMESTAMP_CALIBRATION if ((pWrappedQueue != nullptr) && (pTimestamps != nullptr)) { VkQueue queue = pWrappedQueue->AppHandle(); VkDevice device = pWrappedQueue->ParentDevice(); TimestampedCmdBufConfig config = {}; config.device = device; config.physicalDevice = pWrappedQueue->PhysicalDevice(); config.mapTimestampMem = false; config.pipelineLoc = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; config.queueFamilyIndex = pWrappedQueue->GetQueueFamilyIndex(); VktTimestampedCmdBuf* pTimestampedCmdBuf = VktTimestampedCmdBuf::Create(config); if (pTimestampedCmdBuf != nullptr) { const VkCommandBuffer cmdBufs[] = { pTimestampedCmdBuf->CmdBufHandle() }; VkSubmitInfo submitInfo = {}; submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submitInfo.pNext = nullptr; submitInfo.waitSemaphoreCount = 0; submitInfo.pWaitSemaphores = nullptr; submitInfo.pWaitDstStageMask = nullptr; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = cmdBufs; submitInfo.signalSemaphoreCount = 0; submitInfo.pSignalSemaphores = nullptr; VkFence fence = VK_NULL_HANDLE; VkFenceCreateInfo fenceCreateInfo = {}; result = device_dispatch_table(queue)->CreateFence(device, &fenceCreateInfo, nullptr, &fence); if (result == VK_SUCCESS) { LARGE_INTEGER largeInt = {}; result = pWrappedQueue->QueueSubmit_ICD(queue, 1, &submitInfo, fence); VKT_ASSERT(result == VK_SUCCESS); VkResult 
waitResult = VK_TIMEOUT; do { waitResult = device_dispatch_table(device)->WaitForFences(device, 1, &fence, VK_TRUE, GPU_FENCE_TIMEOUT_TIME); } while (waitResult == VK_TIMEOUT); // Fetch the GPU counter pTimestampedCmdBuf->GetTimestampResult(&pTimestamps->mBeforeExecutionGPUTimestamp); #ifdef WIN32 // Immediately after, fetch the CPU counter QueryPerformanceCounter(&largeInt); pTimestamps->mBeforeExecutionCPUTimestamp = largeInt.QuadPart; #endif pTimestamps->mQueueFrequency = (UINT64)pWrappedQueue->GetTimestampFrequency(); device_dispatch_table(device)->DestroyFence(device, fence, nullptr); } delete pTimestampedCmdBuf; pTimestampedCmdBuf = nullptr; } } #else UNREFERENCED_PARAMETER(pWrappedQueue); UNREFERENCED_PARAMETER(pTimestamps); #endif return result; }