Ejemplo n.º 1
0
//-----------------------------------------------------------------------------
/// Begin profiling a GPU command.
/// \param pIdInfo The identifying metadata for the new measurement.
/// \returns A profiler result code indicating measurement success.
//-----------------------------------------------------------------------------
ProfilerResultCode VktCmdBufProfiler::BeginCmdMeasurement(const ProfilerMeasurementId* pIdInfo)
{
    ProfilerResultCode profilerResultCode = PROFILER_FAIL;

    if (m_cmdBufData.state != PROFILER_STATE_MEASUREMENT_BEGAN)
    {
        const UINT measurementId = m_cmdBufData.cmdBufMeasurementCount % m_config.measurementsPerGroup;

        // Create new measurement group if full
        if (measurementId == 0)
        {
            VkResult result = VK_INCOMPLETE;
            result = SetupNewMeasurementGroup();
            VKT_ASSERT(result == VK_SUCCESS);

            if (result != VK_SUCCESS)
            {
                Log(logERROR, "SetupNewMeasurementGroup() failed in VktCmdBufProfiler::BeginCmdMeasurement()\n");
            }

            m_pDeviceDT->CmdResetQueryPool(m_config.cmdBuf, m_cmdBufData.pActiveMeasurementGroup->gpuRes.timestampQueryPool, 0, m_maxQueriesPerGroup);
        }

        if (m_config.measurementTypeFlags & PROFILER_MEASUREMENT_TYPE_TIMESTAMPS)
        {
            const UINT offset = measurementId * ProfilerTimestampsPerMeasurement;

            // Inject timestamp
            m_pDeviceDT->CmdWriteTimestamp(
                m_config.cmdBuf,
                VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                m_cmdBufData.pActiveMeasurementGroup->gpuRes.timestampQueryPool,
                offset);

            // Inject timestamp
            m_pDeviceDT->CmdWriteTimestamp(
                m_config.cmdBuf,
                VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                m_cmdBufData.pActiveMeasurementGroup->gpuRes.timestampQueryPool,
                offset + 1);
        }

        m_cmdBufData.cmdBufMeasurementCount++;

        // Add a new measurement
        ProfilerMeasurementInfo clientData = ProfilerMeasurementInfo();
        clientData.measurementNum = m_cmdBufData.cmdBufMeasurementCount;

        if (pIdInfo != nullptr)
        {
            memcpy(&clientData.idInfo, pIdInfo, sizeof(ProfilerMeasurementId));
        }

        m_cmdBufData.pActiveMeasurementGroup->measurementInfos.push_back(clientData);

        m_cmdBufData.pActiveMeasurementGroup->groupMeasurementCount++;

        m_cmdBufData.state = PROFILER_STATE_MEASUREMENT_BEGAN;

        profilerResultCode = PROFILER_SUCCESS;
    }
    else
    {
        profilerResultCode = PROFILER_ERROR_MEASUREMENT_ALREADY_BEGAN;

        VKT_ASSERT_ALWAYS();
    }

    return profilerResultCode;
}
Ejemplo n.º 2
0
//-----------------------------------------------------------------------------
/// Notify the profiler that this command buffer was manually reset.
//-----------------------------------------------------------------------------
void VktCmdBufProfiler::NotifyCmdBufReset()
{
    VKT_ASSERT((m_cmdBufData.state == PROFILER_STATE_INIT) || (m_cmdBufData.state == PROFILER_STATE_CMD_BUF_CLOSED));
}
Ejemplo n.º 3
0
//-----------------------------------------------------------------------------
/// Submit command buffers and gather results.
/// \param queue The queue issued work to.
/// \param submitCount The number of submits.
/// \param pSubmits The submit info structures.
/// \param fence The fence wrapping this submit.
//-----------------------------------------------------------------------------
VkResult VktWrappedQueue::QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence)
{
    m_executionID++;

    VkResult result = VK_INCOMPLETE;

    VktTraceAnalyzerLayer* pTraceAnalyzer = VktTraceAnalyzerLayer::Instance();
    VktFrameProfilerLayer* pFrameProfiler = VktFrameProfilerLayer::Instance();

    // Use this calibration timestamp structure to convert GPU events to the CPU timeline.
    CalibrationTimestampPair calibrationTimestamps = {};
    calibrationTimestamps.mQueueCanBeTimestamped = true;

    VkFence fenceToWaitOn = fence;
    bool usingInternalFence = false;

    std::vector<VktWrappedCmdBuf*> wrappedCmdBufs;
    GatherWrappedCommandBufs(submitCount, pSubmits, wrappedCmdBufs);

    for (UINT i = 0; i < wrappedCmdBufs.size(); i++)
    {
        wrappedCmdBufs[i]->SetProfilerExecutionId(m_executionID);
        wrappedCmdBufs[i]->IncrementSubmitCount();
    }

    // Surround the execution of CommandBuffers with timestamps so we can determine when the GPU work occurred in the CPU timeline.
    if (pTraceAnalyzer->ShouldCollectTrace() && pFrameProfiler->ShouldCollectGPUTime())
    {
        // Collect calibration timestamps in case we need to align GPU events against the CPU timeline.
        if (calibrationTimestamps.mQueueCanBeTimestamped)
        {
            pFrameProfiler->CollectCalibrationTimestamps(this, &calibrationTimestamps);
        }
        else
        {
            Log(logTRACE, "Did not collect calibration timestamps for Queue '0x%p'\n", this);
        }

        // Inject our own fence if the app did not supply one
        if (fenceToWaitOn == VK_NULL_HANDLE)
        {
            // Create internal fence
            VkFenceCreateInfo fenceCreateInfo = {};
            VkResult fenceResult = VK_INCOMPLETE;
            fenceResult = device_dispatch_table(queue)->CreateFence(m_createInfo.device, &fenceCreateInfo, nullptr, &fenceToWaitOn);
            VKT_ASSERT(fenceResult == VK_SUCCESS);

            usingInternalFence = true;
        }
    }

    // Invoke the real call to execute on the GPU
    result = QueueSubmit_ICD(queue, submitCount, pSubmits, fenceToWaitOn);

    if (pTraceAnalyzer->ShouldCollectTrace() && pFrameProfiler->ShouldCollectGPUTime())
    {
        // Collect the CPU and GPU frequency to convert timestamps.
        QueryPerformanceFrequency(&calibrationTimestamps.cpuFrequency);

#if GATHER_PROFILER_RESULTS_WITH_WORKERS
        SpawnWorker(&calibrationTimestamps, this, fenceToWaitOn, usingInternalFence, wrappedCmdBufs);
#else
        VkResult waitResult = VK_TIMEOUT;

#if GPU_FENCES_FOR_PROFILER_WAIT
        do
        {
            waitResult = device_dispatch_table(m_createInfo.device)->WaitForFences(m_createInfo.device, 1, &fenceToWaitOn, VK_TRUE, GPU_FENCE_TIMEOUT_TIME);
        } while (waitResult == VK_TIMEOUT);
#else
        waitResult = device_dispatch_table(queue)->QueueWaitIdle(queue);
#endif

        if (calibrationTimestamps.mQueueCanBeTimestamped)
        {
            // Put all results into thread ID 0 bucket
            const UINT32 threadID = 0;
            std::vector<ProfilerResult> results;

            for (UINT i = 0; i < wrappedCmdBufs.size(); i++)
            {
                ProfilerResultCode getResultsResult = PROFILER_FAIL;
                getResultsResult = wrappedCmdBufs[i]->GetCmdBufResultsST(results);
                VKT_ASSERT(getResultsResult != PROFILER_FAIL);
            }

            pFrameProfiler->VerifyAlignAndStoreResults(this, results, &calibrationTimestamps, threadID, VktTraceAnalyzerLayer::Instance()->GetFrameStartTime());

            // Free the fence we created earlier
            if (usingInternalFence)
            {
                device_dispatch_table(m_createInfo.device)->DestroyFence(m_createInfo.device, fenceToWaitOn, nullptr);
            }
        }
        else
        {
            Log(logTRACE, "Didn't collect calibration timestamps for Queue '0x%p'.\n", this);
        }
#endif
    }

#if GATHER_PROFILER_RESULTS_WITH_WORKERS == 0
    for (UINT i = 0; i < wrappedCmdBufs.size(); i++)
    {
        wrappedCmdBufs[i]->DestroyDynamicProfilers();
    }
#endif

    return result;
}
Ejemplo n.º 4
0
//-----------------------------------------------------------------------------
/// We assume this will be called immediately after a command buffer has been submitted.
/// \param results A vector containing performance information for a given function.
/// \returns A code with the result of collecting profiler results for the CommandBuffer.
//-----------------------------------------------------------------------------
ProfilerResultCode VktCmdBufProfiler::GetCmdBufResults(std::vector<ProfilerResult>& results)
{
    ScopeLock lock(&m_mutex);

    ProfilerResultCode profilerResultCode = PROFILER_THIS_CMD_BUF_WAS_NOT_CLOSED;

    VkResult result = VK_INCOMPLETE;

    if (m_cmdBufData.state == PROFILER_STATE_CMD_BUF_CLOSED)
    {
        profilerResultCode = PROFILER_THIS_CMD_BUF_WAS_NOT_MEASURED;

        // Loop through all measurements for this command buffer
        for (UINT i = 0; i < m_cmdBufData.measurementGroups.size(); i++)
        {
            ProfilerMeasurementGroup& currGroup = m_cmdBufData.measurementGroups[i];

            ProfilerInterval* pTimestampData = nullptr;

            if (m_config.measurementTypeFlags & PROFILER_MEASUREMENT_TYPE_TIMESTAMPS)
            {
                // We use vkCmdCopyQueryPoolResults
                if (m_config.mapTimestampMem == true)
                {
                    result = m_pDeviceDT->MapMemory(
                                 m_config.device,
                                 currGroup.gpuRes.timestampMem,
                                 0,
                                 VK_WHOLE_SIZE,
                                 0,
                                 (void**)&pTimestampData);
                }

                // We use vkGetQueryPoolResults
                else
                {
                    pTimestampData = new ProfilerInterval[currGroup.groupMeasurementCount]();

                    result = m_pDeviceDT->GetQueryPoolResults(
                                 m_config.device,
                                 currGroup.gpuRes.timestampQueryPool,
                                 0,
                                 currGroup.groupMeasurementCount * ProfilerTimestampsPerMeasurement,
                                 currGroup.groupMeasurementCount * sizeof(ProfilerInterval),
                                 pTimestampData,
                                 sizeof(UINT64),
                                 VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
                }
            }

            // Report no results
            if (m_config.measurementTypeFlags == PROFILER_MEASUREMENT_TYPE_NONE)
            {
                for (UINT j = 0; j < currGroup.groupMeasurementCount; j++)
                {
                    ProfilerResult profilerResult = ProfilerResult();
                    results.push_back(profilerResult);
                }
            }

            // Fetch our results
            else
            {
                profilerResultCode = PROFILER_SUCCESS;

                for (UINT j = 0; j < currGroup.groupMeasurementCount; j++)
                {
                    ProfilerResult profilerResult = ProfilerResult();

                    memcpy(&profilerResult.measurementInfo, &currGroup.measurementInfos[j], sizeof(ProfilerMeasurementInfo));

                    if (m_config.measurementTypeFlags & PROFILER_MEASUREMENT_TYPE_TIMESTAMPS)
                    {
                        UINT64* pTimerPreBegin = &pTimestampData[j].preStart;
                        UINT64* pTimerBegin = &pTimestampData[j].start;
                        UINT64* pTimerEnd = &pTimestampData[j].end;
                        UINT64 baseClock = pTimestampData[0].start;

                        // Store raw clocks
                        profilerResult.timestampResult.rawClocks.preStart = *pTimerPreBegin;
                        profilerResult.timestampResult.rawClocks.start = *pTimerBegin;
                        profilerResult.timestampResult.rawClocks.end = *pTimerEnd;

                        // Calculate adjusted clocks
                        profilerResult.timestampResult.adjustedClocks.preStart = 0;
                        profilerResult.timestampResult.adjustedClocks.start = *pTimerBegin - baseClock;
                        profilerResult.timestampResult.adjustedClocks.end = *pTimerEnd - baseClock;

                        // Calculate exec time
                        profilerResult.timestampResult.execMicroSecs = static_cast<double>(*pTimerEnd - *pTimerBegin) / m_gpuTimestampFreq;
                        profilerResult.timestampResult.execMicroSecs *= 1000000;

                        // Detected a zero timestamp. Allow this and continue, but some results are invalid.
                        VKT_ASSERT((*pTimerPreBegin != 0ULL) && (*pTimerBegin != 0ULL) && (*pTimerEnd != 0ULL));
                    }

                    results.push_back(profilerResult);
                }
            }

            if (pTimestampData != nullptr)
            {
                if (m_config.mapTimestampMem == true)
                {
                    m_pDeviceDT->UnmapMemory(m_config.device, currGroup.gpuRes.timestampMem);
                }
                else
                {
                    delete [] pTimestampData;
                    pTimestampData = nullptr;
                }
            }
        }
    }

    // We're done profiling this command buffer, so reset and we'll start over next time.
    result = ResetProfilerState();
    VKT_ASSERT(result == VK_SUCCESS);

    if (result != VK_SUCCESS)
    {
        Log(logERROR, "ResetProfilerState() failed in VktCmdBufProfiler::GetCmdBufResults()\n");
    }

    return profilerResultCode;
}
Ejemplo n.º 5
0
//-----------------------------------------------------------------------------
/// Collect and store calibration timestamps from the CPU and GPU to align execution results in a single timeline.
/// \param pWrappedQueue The Queue responsible for work submission.
/// \param pTimestamps The timestamps structure used to hold timestamps occurring before and after workload execution.
//-----------------------------------------------------------------------------
VkResult VktFrameProfilerLayer::CollectCalibrationTimestamps(VktWrappedQueue* pWrappedQueue, CalibrationTimestampPair* pTimestamps)
{
    VkResult result = VK_INCOMPLETE;

#if MANUAL_TIMESTAMP_CALIBRATION
    if ((pWrappedQueue != nullptr) && (pTimestamps != nullptr))
    {
        VkQueue queue = pWrappedQueue->AppHandle();
        VkDevice device = pWrappedQueue->ParentDevice();

        TimestampedCmdBufConfig config = {};
        config.device           = device;
        config.physicalDevice   = pWrappedQueue->PhysicalDevice();
        config.mapTimestampMem  = false;
        config.pipelineLoc      = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
        config.queueFamilyIndex = pWrappedQueue->GetQueueFamilyIndex();

        VktTimestampedCmdBuf* pTimestampedCmdBuf = VktTimestampedCmdBuf::Create(config);

        if (pTimestampedCmdBuf != nullptr)
        {
            const VkCommandBuffer cmdBufs[] = { pTimestampedCmdBuf->CmdBufHandle() };

            VkSubmitInfo submitInfo = {};
            submitInfo.sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext                = nullptr;
            submitInfo.waitSemaphoreCount   = 0;
            submitInfo.pWaitSemaphores      = nullptr;
            submitInfo.pWaitDstStageMask    = nullptr;
            submitInfo.commandBufferCount   = 1;
            submitInfo.pCommandBuffers      = cmdBufs;
            submitInfo.signalSemaphoreCount = 0;
            submitInfo.pSignalSemaphores    = nullptr;

            VkFence fence = VK_NULL_HANDLE;
            VkFenceCreateInfo fenceCreateInfo = {};
            result = device_dispatch_table(queue)->CreateFence(device, &fenceCreateInfo, nullptr, &fence);

            if (result == VK_SUCCESS)
            {
                LARGE_INTEGER largeInt = {};

                result = pWrappedQueue->QueueSubmit_ICD(queue, 1, &submitInfo, fence);
                VKT_ASSERT(result == VK_SUCCESS);

                VkResult waitResult = VK_TIMEOUT;
                do
                {
                    waitResult = device_dispatch_table(device)->WaitForFences(device, 1, &fence, VK_TRUE, GPU_FENCE_TIMEOUT_TIME);
                } while (waitResult == VK_TIMEOUT);

                // Fetch the GPU counter
                pTimestampedCmdBuf->GetTimestampResult(&pTimestamps->mBeforeExecutionGPUTimestamp);

#ifdef WIN32
                // Immediately after, fetch the CPU counter
                QueryPerformanceCounter(&largeInt);

                pTimestamps->mBeforeExecutionCPUTimestamp = largeInt.QuadPart;
#endif

                pTimestamps->mQueueFrequency = (UINT64)pWrappedQueue->GetTimestampFrequency();

                device_dispatch_table(device)->DestroyFence(device, fence, nullptr);
            }

            delete pTimestampedCmdBuf;
            pTimestampedCmdBuf = nullptr;
        }
    }
#else
    UNREFERENCED_PARAMETER(pWrappedQueue);
    UNREFERENCED_PARAMETER(pTimestamps);
#endif

    return result;
}