void D3D12Replay::InitPostVSBuffers(uint32_t eventId) { // go through any aliasing if(m_PostVSAlias.find(eventId) != m_PostVSAlias.end()) eventId = m_PostVSAlias[eventId]; if(m_PostVSData.find(eventId) != m_PostVSData.end()) return; D3D12CommandData *cmd = m_pDevice->GetQueue()->GetCommandData(); const D3D12RenderState &rs = cmd->m_RenderState; if(rs.pipe == ResourceId()) return; WrappedID3D12PipelineState *origPSO = m_pDevice->GetResourceManager()->GetCurrentAs<WrappedID3D12PipelineState>(rs.pipe); if(!origPSO->IsGraphics()) return; D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = origPSO->GetGraphicsDesc(); if(psoDesc.VS.BytecodeLength == 0) return; WrappedID3D12Shader *vs = origPSO->VS(); D3D_PRIMITIVE_TOPOLOGY topo = rs.topo; const DrawcallDescription *drawcall = m_pDevice->GetDrawcall(eventId); if(drawcall->numIndices == 0) return; DXBC::DXBCFile *dxbcVS = vs->GetDXBC(); RDCASSERT(dxbcVS); DXBC::DXBCFile *dxbcGS = NULL; WrappedID3D12Shader *gs = origPSO->GS(); if(gs) { dxbcGS = gs->GetDXBC(); RDCASSERT(dxbcGS); } DXBC::DXBCFile *dxbcDS = NULL; WrappedID3D12Shader *ds = origPSO->DS(); if(ds) { dxbcDS = ds->GetDXBC(); RDCASSERT(dxbcDS); } ID3D12RootSignature *soSig = NULL; HRESULT hr = S_OK; { WrappedID3D12RootSignature *sig = m_pDevice->GetResourceManager()->GetCurrentAs<WrappedID3D12RootSignature>(rs.graphics.rootsig); D3D12RootSignature rootsig = sig->sig; // create a root signature that allows stream out, if necessary if((rootsig.Flags & D3D12_ROOT_SIGNATURE_FLAG_ALLOW_STREAM_OUTPUT) == 0) { rootsig.Flags |= D3D12_ROOT_SIGNATURE_FLAG_ALLOW_STREAM_OUTPUT; ID3DBlob *blob = m_pDevice->GetShaderCache()->MakeRootSig(rootsig); hr = m_pDevice->CreateRootSignature(0, blob->GetBufferPointer(), blob->GetBufferSize(), __uuidof(ID3D12RootSignature), (void **)&soSig); if(FAILED(hr)) { RDCERR("Couldn't enable stream-out in root signature: HRESULT: %s", ToStr(hr).c_str()); return; } SAFE_RELEASE(blob); } } vector<D3D12_SO_DECLARATION_ENTRY> sodecls; UINT stride = 0; int posidx 
= -1; int numPosComponents = 0; if(!dxbcVS->m_OutputSig.empty()) { for(const SigParameter &sign : dxbcVS->m_OutputSig) { D3D12_SO_DECLARATION_ENTRY decl; decl.Stream = 0; decl.OutputSlot = 0; decl.SemanticName = sign.semanticName.c_str(); decl.SemanticIndex = sign.semanticIndex; decl.StartComponent = 0; decl.ComponentCount = sign.compCount & 0xff; if(sign.systemValue == ShaderBuiltin::Position) { posidx = (int)sodecls.size(); numPosComponents = decl.ComponentCount = 4; } stride += decl.ComponentCount * sizeof(float); sodecls.push_back(decl); } if(stride == 0) { RDCERR("Didn't get valid stride! Setting to 4 bytes"); stride = 4; } // shift position attribute up to first, keeping order otherwise // the same if(posidx > 0) { D3D12_SO_DECLARATION_ENTRY pos = sodecls[posidx]; sodecls.erase(sodecls.begin() + posidx); sodecls.insert(sodecls.begin(), pos); } // set up stream output entries and buffers psoDesc.StreamOutput.NumEntries = (UINT)sodecls.size(); psoDesc.StreamOutput.pSODeclaration = &sodecls[0]; psoDesc.StreamOutput.NumStrides = 1; psoDesc.StreamOutput.pBufferStrides = &stride; psoDesc.StreamOutput.RasterizedStream = D3D12_SO_NO_RASTERIZED_STREAM; // disable all other shader stages psoDesc.HS.BytecodeLength = 0; psoDesc.HS.pShaderBytecode = NULL; psoDesc.DS.BytecodeLength = 0; psoDesc.DS.pShaderBytecode = NULL; psoDesc.GS.BytecodeLength = 0; psoDesc.GS.pShaderBytecode = NULL; psoDesc.PS.BytecodeLength = 0; psoDesc.PS.pShaderBytecode = NULL; // disable any rasterization/use of output targets psoDesc.DepthStencilState.DepthEnable = FALSE; psoDesc.DepthStencilState.DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ZERO; psoDesc.DepthStencilState.StencilEnable = FALSE; if(soSig) psoDesc.pRootSignature = soSig; // render as points psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT; // disable outputs psoDesc.NumRenderTargets = 0; RDCEraseEl(psoDesc.RTVFormats); psoDesc.DSVFormat = DXGI_FORMAT_UNKNOWN; ID3D12PipelineState *pipe = NULL; hr = 
m_pDevice->CreateGraphicsPipelineState(&psoDesc, __uuidof(ID3D12PipelineState), (void **)&pipe); if(FAILED(hr)) { RDCERR("Couldn't create patched graphics pipeline: HRESULT: %s", ToStr(hr).c_str()); SAFE_RELEASE(soSig); return; } ID3D12Resource *idxBuf = NULL; bool recreate = false; uint64_t outputSize = uint64_t(drawcall->numIndices) * drawcall->numInstances * stride; if(m_SOBufferSize < outputSize) { uint64_t oldSize = m_SOBufferSize; while(m_SOBufferSize < outputSize) m_SOBufferSize *= 2; RDCWARN("Resizing stream-out buffer from %llu to %llu for output data", oldSize, m_SOBufferSize); recreate = true; } ID3D12GraphicsCommandList *list = NULL; if(!(drawcall->flags & DrawFlags::UseIBuffer)) { if(recreate) { m_pDevice->GPUSync(); CreateSOBuffers(); } list = GetDebugManager()->ResetDebugList(); rs.ApplyState(list); list->SetPipelineState(pipe); if(soSig) { list->SetGraphicsRootSignature(soSig); rs.ApplyGraphicsRootElements(list); } D3D12_STREAM_OUTPUT_BUFFER_VIEW view; view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress(); view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64; view.SizeInBytes = m_SOBufferSize; list->SOSetTargets(0, 1, &view); list->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST); list->DrawInstanced(drawcall->numIndices, drawcall->numInstances, drawcall->vertexOffset, drawcall->instanceOffset); } else // drawcall is indexed { bytebuf idxdata; GetBufferData(rs.ibuffer.buf, rs.ibuffer.offs + drawcall->indexOffset * rs.ibuffer.bytewidth, RDCMIN(drawcall->numIndices * rs.ibuffer.bytewidth, rs.ibuffer.size), idxdata); vector<uint32_t> indices; uint16_t *idx16 = (uint16_t *)&idxdata[0]; uint32_t *idx32 = (uint32_t *)&idxdata[0]; // only read as many indices as were available in the buffer uint32_t numIndices = RDCMIN(uint32_t(idxdata.size() / rs.ibuffer.bytewidth), drawcall->numIndices); uint32_t idxclamp = 0; if(drawcall->baseVertex < 0) idxclamp = uint32_t(-drawcall->baseVertex); // grab all unique vertex indices 
referenced for(uint32_t i = 0; i < numIndices; i++) { uint32_t i32 = rs.ibuffer.bytewidth == 2 ? uint32_t(idx16[i]) : idx32[i]; // apply baseVertex but clamp to 0 (don't allow index to become negative) if(i32 < idxclamp) i32 = 0; else if(drawcall->baseVertex < 0) i32 -= idxclamp; else if(drawcall->baseVertex > 0) i32 += drawcall->baseVertex; auto it = std::lower_bound(indices.begin(), indices.end(), i32); if(it != indices.end() && *it == i32) continue; indices.insert(it, i32); } // if we read out of bounds, we'll also have a 0 index being referenced // (as 0 is read). Don't insert 0 if we already have 0 though if(numIndices < drawcall->numIndices && (indices.empty() || indices[0] != 0)) indices.insert(indices.begin(), 0); // An index buffer could be something like: 500, 501, 502, 501, 503, 502 // in which case we can't use the existing index buffer without filling 499 slots of vertex // data with padding. Instead we rebase the indices based on the smallest vertex so it becomes // 0, 1, 2, 1, 3, 2 and then that matches our stream-out'd buffer. // // Note that there could also be gaps, like: 500, 501, 502, 510, 511, 512 // which would become 0, 1, 2, 3, 4, 5 and so the old index buffer would no longer be valid. // We just stream-out a tightly packed list of unique indices, and then remap the index buffer // so that what did point to 500 points to 0 (accounting for rebasing), and what did point // to 510 now points to 3 (accounting for the unique sort). // we use a map here since the indices may be sparse. Especially considering if an index // is 'invalid' like 0xcccccccc then we don't want an array of 3.4 billion entries. 
map<uint32_t, size_t> indexRemap; for(size_t i = 0; i < indices.size(); i++) { // by definition, this index will only appear once in indices[] indexRemap[indices[i]] = i; } if(m_SOBufferSize / sizeof(Vec4f) < indices.size() * sizeof(uint32_t)) { uint64_t oldSize = m_SOBufferSize; while(m_SOBufferSize / sizeof(Vec4f) < indices.size() * sizeof(uint32_t)) m_SOBufferSize *= 2; RDCWARN("Resizing stream-out buffer from %llu to %llu for indices", oldSize, m_SOBufferSize); recreate = true; } if(recreate) { m_pDevice->GPUSync(); CreateSOBuffers(); } GetDebugManager()->FillBuffer(m_SOPatchedIndexBuffer, 0, &indices[0], indices.size() * sizeof(uint32_t)); D3D12_INDEX_BUFFER_VIEW patchedIB; patchedIB.BufferLocation = m_SOPatchedIndexBuffer->GetGPUVirtualAddress(); patchedIB.Format = DXGI_FORMAT_R32_UINT; patchedIB.SizeInBytes = UINT(indices.size() * sizeof(uint32_t)); list = GetDebugManager()->ResetDebugList(); rs.ApplyState(list); list->SetPipelineState(pipe); list->IASetIndexBuffer(&patchedIB); if(soSig) { list->SetGraphicsRootSignature(soSig); rs.ApplyGraphicsRootElements(list); } D3D12_STREAM_OUTPUT_BUFFER_VIEW view; view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress(); view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64; view.SizeInBytes = m_SOBufferSize; list->SOSetTargets(0, 1, &view); list->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST); list->DrawIndexedInstanced((UINT)indices.size(), drawcall->numInstances, 0, 0, drawcall->instanceOffset); uint32_t stripCutValue = 0; if(psoDesc.IBStripCutValue == D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFF) stripCutValue = 0xffff; else if(psoDesc.IBStripCutValue == D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFFFFFF) stripCutValue = 0xffffffff; // rebase existing index buffer to point to the right elements in our stream-out'd // vertex buffer for(uint32_t i = 0; i < numIndices; i++) { uint32_t i32 = rs.ibuffer.bytewidth == 2 ? 
uint32_t(idx16[i]) : idx32[i]; // preserve primitive restart indices if(stripCutValue && i32 == stripCutValue) continue; // apply baseVertex but clamp to 0 (don't allow index to become negative) if(i32 < idxclamp) i32 = 0; else if(drawcall->baseVertex < 0) i32 -= idxclamp; else if(drawcall->baseVertex > 0) i32 += drawcall->baseVertex; if(rs.ibuffer.bytewidth == 2) idx16[i] = uint16_t(indexRemap[i32]); else idx32[i] = uint32_t(indexRemap[i32]); } idxBuf = NULL; if(!idxdata.empty()) { D3D12_RESOURCE_DESC idxBufDesc; idxBufDesc.Alignment = 0; idxBufDesc.DepthOrArraySize = 1; idxBufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; idxBufDesc.Flags = D3D12_RESOURCE_FLAG_NONE; idxBufDesc.Format = DXGI_FORMAT_UNKNOWN; idxBufDesc.Height = 1; idxBufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; idxBufDesc.MipLevels = 1; idxBufDesc.SampleDesc.Count = 1; idxBufDesc.SampleDesc.Quality = 0; idxBufDesc.Width = idxdata.size(); D3D12_HEAP_PROPERTIES heapProps; heapProps.Type = D3D12_HEAP_TYPE_UPLOAD; heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; heapProps.CreationNodeMask = 1; heapProps.VisibleNodeMask = 1; hr = m_pDevice->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &idxBufDesc, D3D12_RESOURCE_STATE_GENERIC_READ, NULL, __uuidof(ID3D12Resource), (void **)&idxBuf); RDCASSERTEQUAL(hr, S_OK); SetObjName(idxBuf, StringFormat::Fmt("PostVS idxBuf for %u", eventId)); GetDebugManager()->FillBuffer(idxBuf, 0, &idxdata[0], idxdata.size()); } } D3D12_RESOURCE_BARRIER sobarr = {}; sobarr.Transition.pResource = m_SOBuffer; sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_STREAM_OUT; sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE; list->ResourceBarrier(1, &sobarr); list->CopyResource(m_SOStagingBuffer, m_SOBuffer); // we're done with this after the copy, so we can discard it and reset // the counter for the next stream-out sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE; 
sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; list->DiscardResource(m_SOBuffer, NULL); list->ResourceBarrier(1, &sobarr); UINT zeroes[4] = {0, 0, 0, 0}; list->ClearUnorderedAccessViewUint(GetDebugManager()->GetGPUHandle(STREAM_OUT_UAV), GetDebugManager()->GetUAVClearHandle(STREAM_OUT_UAV), m_SOBuffer, zeroes, 0, NULL); list->Close(); ID3D12CommandList *l = list; m_pDevice->GetQueue()->ExecuteCommandLists(1, &l); m_pDevice->GPUSync(); GetDebugManager()->ResetDebugAlloc(); SAFE_RELEASE(pipe); byte *byteData = NULL; D3D12_RANGE range = {0, (SIZE_T)m_SOBufferSize}; hr = m_SOStagingBuffer->Map(0, &range, (void **)&byteData); if(FAILED(hr)) { RDCERR("Failed to map sobuffer HRESULT: %s", ToStr(hr).c_str()); SAFE_RELEASE(idxBuf); SAFE_RELEASE(soSig); return; } range.End = 0; uint64_t numBytesWritten = *(uint64_t *)byteData; if(numBytesWritten == 0) { m_PostVSData[eventId] = D3D12PostVSData(); SAFE_RELEASE(idxBuf); SAFE_RELEASE(soSig); return; } // skip past the counter byteData += 64; uint64_t numPrims = numBytesWritten / stride; ID3D12Resource *vsoutBuffer = NULL; { D3D12_RESOURCE_DESC vertBufDesc; vertBufDesc.Alignment = 0; vertBufDesc.DepthOrArraySize = 1; vertBufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; vertBufDesc.Flags = D3D12_RESOURCE_FLAG_NONE; vertBufDesc.Format = DXGI_FORMAT_UNKNOWN; vertBufDesc.Height = 1; vertBufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; vertBufDesc.MipLevels = 1; vertBufDesc.SampleDesc.Count = 1; vertBufDesc.SampleDesc.Quality = 0; vertBufDesc.Width = numBytesWritten; D3D12_HEAP_PROPERTIES heapProps; heapProps.Type = D3D12_HEAP_TYPE_UPLOAD; heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; heapProps.CreationNodeMask = 1; heapProps.VisibleNodeMask = 1; hr = m_pDevice->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &vertBufDesc, D3D12_RESOURCE_STATE_GENERIC_READ, NULL, __uuidof(ID3D12Resource), (void **)&vsoutBuffer); 
RDCASSERTEQUAL(hr, S_OK); if(vsoutBuffer) { SetObjName(vsoutBuffer, StringFormat::Fmt("PostVS vsoutBuffer for %u", eventId)); GetDebugManager()->FillBuffer(vsoutBuffer, 0, byteData, (size_t)numBytesWritten); } } float nearp = 0.1f; float farp = 100.0f; Vec4f *pos0 = (Vec4f *)byteData; bool found = false; for(uint64_t i = 1; numPosComponents == 4 && i < numPrims; i++) { ////////////////////////////////////////////////////////////////////////////////// // derive near/far, assuming a standard perspective matrix // // the transformation from from pre-projection {Z,W} to post-projection {Z,W} // is linear. So we can say Zpost = Zpre*m + c . Here we assume Wpre = 1 // and we know Wpost = Zpre from the perspective matrix. // we can then see from the perspective matrix that // m = F/(F-N) // c = -(F*N)/(F-N) // // with re-arranging and substitution, we then get: // N = -c/m // F = c/(1-m) // // so if we can derive m and c then we can determine N and F. We can do this with // two points, and we pick them reasonably distinct on z to reduce floating-point // error Vec4f *pos = (Vec4f *)(byteData + i * stride); if(fabs(pos->w - pos0->w) > 0.01f && fabs(pos->z - pos0->z) > 0.01f) { Vec2f A(pos0->w, pos0->z); Vec2f B(pos->w, pos->z); float m = (B.y - A.y) / (B.x - A.x); float c = B.y - B.x * m; if(m == 1.0f) continue; nearp = -c / m; farp = c / (1 - m); found = true; break; } } // if we didn't find anything, all z's and w's were identical. 
// If the z is positive and w greater for the first element then // we detect this projection as reversed z with infinite far plane if(!found && pos0->z > 0.0f && pos0->w > pos0->z) { nearp = pos0->z; farp = FLT_MAX; } m_SOStagingBuffer->Unmap(0, &range); m_PostVSData[eventId].vsin.topo = topo; m_PostVSData[eventId].vsout.buf = vsoutBuffer; m_PostVSData[eventId].vsout.vertStride = stride; m_PostVSData[eventId].vsout.nearPlane = nearp; m_PostVSData[eventId].vsout.farPlane = farp; m_PostVSData[eventId].vsout.useIndices = bool(drawcall->flags & DrawFlags::UseIBuffer); m_PostVSData[eventId].vsout.numVerts = drawcall->numIndices; m_PostVSData[eventId].vsout.instStride = 0; if(drawcall->flags & DrawFlags::Instanced) m_PostVSData[eventId].vsout.instStride = uint32_t(numBytesWritten / RDCMAX(1U, drawcall->numInstances)); m_PostVSData[eventId].vsout.idxBuf = NULL; if(m_PostVSData[eventId].vsout.useIndices && idxBuf) { m_PostVSData[eventId].vsout.idxBuf = idxBuf; m_PostVSData[eventId].vsout.idxFmt = rs.ibuffer.bytewidth == 2 ? 
DXGI_FORMAT_R16_UINT : DXGI_FORMAT_R32_UINT; } m_PostVSData[eventId].vsout.hasPosOut = posidx >= 0; m_PostVSData[eventId].vsout.topo = topo; } else { // empty vertex output signature m_PostVSData[eventId].vsin.topo = topo; m_PostVSData[eventId].vsout.buf = NULL; m_PostVSData[eventId].vsout.instStride = 0; m_PostVSData[eventId].vsout.vertStride = 0; m_PostVSData[eventId].vsout.nearPlane = 0.0f; m_PostVSData[eventId].vsout.farPlane = 0.0f; m_PostVSData[eventId].vsout.useIndices = false; m_PostVSData[eventId].vsout.hasPosOut = false; m_PostVSData[eventId].vsout.idxBuf = NULL; m_PostVSData[eventId].vsout.topo = topo; } if(dxbcGS || dxbcDS) { stride = 0; posidx = -1; numPosComponents = 0; DXBC::DXBCFile *lastShader = dxbcGS; if(dxbcDS) lastShader = dxbcDS; sodecls.clear(); for(const SigParameter &sign : lastShader->m_OutputSig) { D3D12_SO_DECLARATION_ENTRY decl; // for now, skip streams that aren't stream 0 if(sign.stream != 0) continue; decl.Stream = 0; decl.OutputSlot = 0; decl.SemanticName = sign.semanticName.c_str(); decl.SemanticIndex = sign.semanticIndex; decl.StartComponent = 0; decl.ComponentCount = sign.compCount & 0xff; if(sign.systemValue == ShaderBuiltin::Position) { posidx = (int)sodecls.size(); numPosComponents = decl.ComponentCount = 4; } stride += decl.ComponentCount * sizeof(float); sodecls.push_back(decl); } // shift position attribute up to first, keeping order otherwise // the same if(posidx > 0) { D3D12_SO_DECLARATION_ENTRY pos = sodecls[posidx]; sodecls.erase(sodecls.begin() + posidx); sodecls.insert(sodecls.begin(), pos); } // enable the other shader stages again if(origPSO->DS()) psoDesc.DS = origPSO->DS()->GetDesc(); if(origPSO->HS()) psoDesc.HS = origPSO->HS()->GetDesc(); if(origPSO->GS()) psoDesc.GS = origPSO->GS()->GetDesc(); // configure new SO declarations psoDesc.StreamOutput.NumEntries = (UINT)sodecls.size(); psoDesc.StreamOutput.pSODeclaration = &sodecls[0]; psoDesc.StreamOutput.NumStrides = 1; psoDesc.StreamOutput.pBufferStrides = 
&stride; // we're using the same topology this time psoDesc.PrimitiveTopologyType = origPSO->graphics->PrimitiveTopologyType; ID3D12PipelineState *pipe = NULL; hr = m_pDevice->CreateGraphicsPipelineState(&psoDesc, __uuidof(ID3D12PipelineState), (void **)&pipe); if(FAILED(hr)) { RDCERR("Couldn't create patched graphics pipeline: HRESULT: %s", ToStr(hr).c_str()); SAFE_RELEASE(soSig); return; } D3D12_STREAM_OUTPUT_BUFFER_VIEW view; ID3D12GraphicsCommandList *list = NULL; view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress(); view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64; view.SizeInBytes = m_SOBufferSize; // draws with multiple instances must be replayed one at a time so we can record the number of // primitives from each drawcall, as due to expansion this can vary per-instance. if(drawcall->numInstances > 1) { list = GetDebugManager()->ResetDebugList(); rs.ApplyState(list); list->SetPipelineState(pipe); if(soSig) { list->SetGraphicsRootSignature(soSig); rs.ApplyGraphicsRootElements(list); } view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress(); view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64; view.SizeInBytes = m_SOBufferSize; // do a dummy draw to make sure we have enough space in the output buffer list->SOSetTargets(0, 1, &view); list->BeginQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0); // because the result is expanded we don't have to remap index buffers or anything if(drawcall->flags & DrawFlags::UseIBuffer) { list->DrawIndexedInstanced(drawcall->numIndices, drawcall->numInstances, drawcall->indexOffset, drawcall->baseVertex, drawcall->instanceOffset); } else { list->DrawInstanced(drawcall->numIndices, drawcall->numInstances, drawcall->vertexOffset, drawcall->instanceOffset); } list->EndQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0); list->ResolveQueryData(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0, 1, m_SOStagingBuffer, 0); list->Close(); ID3D12CommandList 
*l = list; m_pDevice->GetQueue()->ExecuteCommandLists(1, &l); m_pDevice->GPUSync(); // check that things are OK, and resize up if needed D3D12_RANGE range; range.Begin = 0; range.End = (SIZE_T)sizeof(D3D12_QUERY_DATA_SO_STATISTICS); D3D12_QUERY_DATA_SO_STATISTICS *data; hr = m_SOStagingBuffer->Map(0, &range, (void **)&data); D3D12_QUERY_DATA_SO_STATISTICS result = *data; range.End = 0; m_SOStagingBuffer->Unmap(0, &range); if(m_SOBufferSize < data->PrimitivesStorageNeeded * 3 * stride) { uint64_t oldSize = m_SOBufferSize; while(m_SOBufferSize < data->PrimitivesStorageNeeded * 3 * stride) m_SOBufferSize *= 2; RDCWARN("Resizing stream-out buffer from %llu to %llu for output", oldSize, m_SOBufferSize); CreateSOBuffers(); } view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress(); view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64; view.SizeInBytes = m_SOBufferSize; GetDebugManager()->ResetDebugAlloc(); // now do the actual stream out list = GetDebugManager()->ResetDebugList(); // first need to reset the counter byte values which may have either been written to above, or // are newly created { D3D12_RESOURCE_BARRIER sobarr = {}; sobarr.Transition.pResource = m_SOBuffer; sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_STREAM_OUT; sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; list->ResourceBarrier(1, &sobarr); D3D12_UNORDERED_ACCESS_VIEW_DESC counterDesc = {}; counterDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; counterDesc.Format = DXGI_FORMAT_R32_UINT; counterDesc.Buffer.FirstElement = 0; counterDesc.Buffer.NumElements = 4; UINT zeroes[4] = {0, 0, 0, 0}; list->ClearUnorderedAccessViewUint(GetDebugManager()->GetGPUHandle(STREAM_OUT_UAV), GetDebugManager()->GetUAVClearHandle(STREAM_OUT_UAV), m_SOBuffer, zeroes, 0, NULL); std::swap(sobarr.Transition.StateBefore, sobarr.Transition.StateAfter); list->ResourceBarrier(1, &sobarr); } rs.ApplyState(list); list->SetPipelineState(pipe); if(soSig) { 
list->SetGraphicsRootSignature(soSig); rs.ApplyGraphicsRootElements(list); } // reserve space for enough 'buffer filled size' locations view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + AlignUp(uint64_t(drawcall->numInstances * sizeof(UINT64)), 64ULL); // do incremental draws to get the output size. We have to do this O(N^2) style because // there's no way to replay only a single instance. We have to replay 1, 2, 3, ... N instances // and count the total number of verts each time, then we can see from the difference how much // each instance wrote. for(uint32_t inst = 1; inst <= drawcall->numInstances; inst++) { if(drawcall->flags & DrawFlags::UseIBuffer) { view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress() + (inst - 1) * sizeof(UINT64); list->SOSetTargets(0, 1, &view); list->DrawIndexedInstanced(drawcall->numIndices, inst, drawcall->indexOffset, drawcall->baseVertex, drawcall->instanceOffset); } else { view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress() + (inst - 1) * sizeof(UINT64); list->SOSetTargets(0, 1, &view); list->DrawInstanced(drawcall->numIndices, inst, drawcall->vertexOffset, drawcall->instanceOffset); } } list->Close(); l = list; m_pDevice->GetQueue()->ExecuteCommandLists(1, &l); m_pDevice->GPUSync(); GetDebugManager()->ResetDebugAlloc(); // the last draw will have written the actual data we want into the buffer } else { // this only loops if we find from a query that we need to resize up while(true) { list = GetDebugManager()->ResetDebugList(); rs.ApplyState(list); list->SetPipelineState(pipe); if(soSig) { list->SetGraphicsRootSignature(soSig); rs.ApplyGraphicsRootElements(list); } view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress(); view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64; view.SizeInBytes = m_SOBufferSize; list->SOSetTargets(0, 1, &view); list->BeginQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0); // because the result is expanded we don't have to remap 
index buffers or anything if(drawcall->flags & DrawFlags::UseIBuffer) { list->DrawIndexedInstanced(drawcall->numIndices, drawcall->numInstances, drawcall->indexOffset, drawcall->baseVertex, drawcall->instanceOffset); } else { list->DrawInstanced(drawcall->numIndices, drawcall->numInstances, drawcall->vertexOffset, drawcall->instanceOffset); } list->EndQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0); list->ResolveQueryData(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0, 1, m_SOStagingBuffer, 0); list->Close(); ID3D12CommandList *l = list; m_pDevice->GetQueue()->ExecuteCommandLists(1, &l); m_pDevice->GPUSync(); // check that things are OK, and resize up if needed D3D12_RANGE range; range.Begin = 0; range.End = (SIZE_T)sizeof(D3D12_QUERY_DATA_SO_STATISTICS); D3D12_QUERY_DATA_SO_STATISTICS *data; hr = m_SOStagingBuffer->Map(0, &range, (void **)&data); if(m_SOBufferSize < data->PrimitivesStorageNeeded * 3 * stride) { uint64_t oldSize = m_SOBufferSize; while(m_SOBufferSize < data->PrimitivesStorageNeeded * 3 * stride) m_SOBufferSize *= 2; RDCWARN("Resizing stream-out buffer from %llu to %llu for output", oldSize, m_SOBufferSize); CreateSOBuffers(); continue; } range.End = 0; m_SOStagingBuffer->Unmap(0, &range); GetDebugManager()->ResetDebugAlloc(); break; } } list = GetDebugManager()->ResetDebugList(); D3D12_RESOURCE_BARRIER sobarr = {}; sobarr.Transition.pResource = m_SOBuffer; sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_STREAM_OUT; sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE; list->ResourceBarrier(1, &sobarr); list->CopyResource(m_SOStagingBuffer, m_SOBuffer); // we're done with this after the copy, so we can discard it and reset // the counter for the next stream-out sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE; sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; list->DiscardResource(m_SOBuffer, NULL); list->ResourceBarrier(1, &sobarr); D3D12_UNORDERED_ACCESS_VIEW_DESC 
counterDesc = {}; counterDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; counterDesc.Format = DXGI_FORMAT_R32_UINT; counterDesc.Buffer.FirstElement = 0; counterDesc.Buffer.NumElements = 4; UINT zeroes[4] = {0, 0, 0, 0}; list->ClearUnorderedAccessViewUint(GetDebugManager()->GetGPUHandle(STREAM_OUT_UAV), GetDebugManager()->GetUAVClearHandle(STREAM_OUT_UAV), m_SOBuffer, zeroes, 0, NULL); list->Close(); ID3D12CommandList *l = list; m_pDevice->GetQueue()->ExecuteCommandLists(1, &l); m_pDevice->GPUSync(); GetDebugManager()->ResetDebugAlloc(); SAFE_RELEASE(pipe); byte *byteData = NULL; D3D12_RANGE range = {0, (SIZE_T)m_SOBufferSize}; hr = m_SOStagingBuffer->Map(0, &range, (void **)&byteData); if(FAILED(hr)) { RDCERR("Failed to map sobuffer HRESULT: %s", ToStr(hr).c_str()); SAFE_RELEASE(soSig); return; } range.End = 0; uint64_t *counters = (uint64_t *)byteData; uint64_t numBytesWritten = 0; std::vector<D3D12PostVSData::InstData> instData; if(drawcall->numInstances > 1) { uint64_t prevByteCount = 0; for(uint32_t inst = 0; inst < drawcall->numInstances; inst++) { uint64_t byteCount = counters[inst]; D3D12PostVSData::InstData d; d.numVerts = uint32_t((byteCount - prevByteCount) / stride); d.bufOffset = prevByteCount; prevByteCount = byteCount; instData.push_back(d); } numBytesWritten = prevByteCount; } else { numBytesWritten = counters[0]; } if(numBytesWritten == 0) { SAFE_RELEASE(soSig); return; } // skip past the counter(s) byteData += (view.BufferLocation - m_SOBuffer->GetGPUVirtualAddress()); uint64_t numVerts = numBytesWritten / stride; ID3D12Resource *gsoutBuffer = NULL; { D3D12_RESOURCE_DESC vertBufDesc; vertBufDesc.Alignment = 0; vertBufDesc.DepthOrArraySize = 1; vertBufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; vertBufDesc.Flags = D3D12_RESOURCE_FLAG_NONE; vertBufDesc.Format = DXGI_FORMAT_UNKNOWN; vertBufDesc.Height = 1; vertBufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; vertBufDesc.MipLevels = 1; vertBufDesc.SampleDesc.Count = 1; 
vertBufDesc.SampleDesc.Quality = 0; vertBufDesc.Width = numBytesWritten; D3D12_HEAP_PROPERTIES heapProps; heapProps.Type = D3D12_HEAP_TYPE_UPLOAD; heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; heapProps.CreationNodeMask = 1; heapProps.VisibleNodeMask = 1; hr = m_pDevice->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &vertBufDesc, D3D12_RESOURCE_STATE_GENERIC_READ, NULL, __uuidof(ID3D12Resource), (void **)&gsoutBuffer); RDCASSERTEQUAL(hr, S_OK); if(gsoutBuffer) { SetObjName(gsoutBuffer, StringFormat::Fmt("PostVS gsoutBuffer for %u", eventId)); GetDebugManager()->FillBuffer(gsoutBuffer, 0, byteData, (size_t)numBytesWritten); } } float nearp = 0.1f; float farp = 100.0f; Vec4f *pos0 = (Vec4f *)byteData; bool found = false; for(UINT64 i = 1; numPosComponents == 4 && i < numVerts; i++) { ////////////////////////////////////////////////////////////////////////////////// // derive near/far, assuming a standard perspective matrix // // the transformation from from pre-projection {Z,W} to post-projection {Z,W} // is linear. So we can say Zpost = Zpre*m + c . Here we assume Wpre = 1 // and we know Wpost = Zpre from the perspective matrix. // we can then see from the perspective matrix that // m = F/(F-N) // c = -(F*N)/(F-N) // // with re-arranging and substitution, we then get: // N = -c/m // F = c/(1-m) // // so if we can derive m and c then we can determine N and F. We can do this with // two points, and we pick them reasonably distinct on z to reduce floating-point // error Vec4f *pos = (Vec4f *)(byteData + i * stride); if(fabs(pos->w - pos0->w) > 0.01f && fabs(pos->z - pos0->z) > 0.01f) { Vec2f A(pos0->w, pos0->z); Vec2f B(pos->w, pos->z); float m = (B.y - A.y) / (B.x - A.x); float c = B.y - B.x * m; if(m == 1.0f) continue; nearp = -c / m; farp = c / (1 - m); found = true; break; } } // if we didn't find anything, all z's and w's were identical. 
// If the z is positive and w greater for the first element then // we detect this projection as reversed z with infinite far plane if(!found && pos0->z > 0.0f && pos0->w > pos0->z) { nearp = pos0->z; farp = FLT_MAX; } m_SOStagingBuffer->Unmap(0, &range); m_PostVSData[eventId].gsout.buf = gsoutBuffer; m_PostVSData[eventId].gsout.instStride = 0; if(drawcall->flags & DrawFlags::Instanced) m_PostVSData[eventId].gsout.instStride = uint32_t(numBytesWritten / RDCMAX(1U, drawcall->numInstances)); m_PostVSData[eventId].gsout.vertStride = stride; m_PostVSData[eventId].gsout.nearPlane = nearp; m_PostVSData[eventId].gsout.farPlane = farp; m_PostVSData[eventId].gsout.useIndices = false; m_PostVSData[eventId].gsout.hasPosOut = posidx >= 0; m_PostVSData[eventId].gsout.idxBuf = NULL; topo = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; if(lastShader == dxbcGS) { for(size_t i = 0; i < dxbcGS->GetNumDeclarations(); i++) { const DXBC::ASMDecl &decl = dxbcGS->GetDeclaration(i); if(decl.declaration == DXBC::OPCODE_DCL_GS_OUTPUT_PRIMITIVE_TOPOLOGY) { topo = decl.outTopology; break; } } } else if(lastShader == dxbcDS) { for(size_t i = 0; i < dxbcDS->GetNumDeclarations(); i++) { const DXBC::ASMDecl &decl = dxbcDS->GetDeclaration(i); if(decl.declaration == DXBC::OPCODE_DCL_TESS_DOMAIN) { if(decl.domain == DXBC::DOMAIN_ISOLINE) topo = D3D_PRIMITIVE_TOPOLOGY_LINELIST; else topo = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; break; } } } m_PostVSData[eventId].gsout.topo = topo; // streamout expands strips unfortunately if(topo == D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP) m_PostVSData[eventId].gsout.topo = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; else if(topo == D3D11_PRIMITIVE_TOPOLOGY_LINESTRIP) m_PostVSData[eventId].gsout.topo = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; else if(topo == D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP_ADJ) m_PostVSData[eventId].gsout.topo = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST_ADJ; else if(topo == D3D11_PRIMITIVE_TOPOLOGY_LINESTRIP_ADJ) m_PostVSData[eventId].gsout.topo = 
D3D11_PRIMITIVE_TOPOLOGY_LINELIST_ADJ; m_PostVSData[eventId].gsout.numVerts = (uint32_t)numVerts; if(drawcall->flags & DrawFlags::Instanced) m_PostVSData[eventId].gsout.numVerts /= RDCMAX(1U, drawcall->numInstances); m_PostVSData[eventId].gsout.instData = instData; } SAFE_RELEASE(soSig); }
// Replays the whole capture with a timestamp query heap attached and returns one
// eCounter_EventGPUDuration result per timed event, sorted.
//
// The heap and readback buffer are sized for two timestamps per event ID; each result is the
// difference between a pair of consecutive timestamps divided by the queue's timestamp
// frequency. Events listed by the callback as aliases get a copy of the result of the event
// they alias.
//
// counters - not consulted by this implementation, the GPU duration is always what is fetched.
//
// Returns an empty list if any of the D3D12 objects can't be created or read back.
vector<CounterResult> D3D12Replay::FetchCounters(const vector<uint32_t> &counters)
{
  uint32_t maxEID = m_pDevice->GetQueue()->GetMaxEID();

  vector<CounterResult> ret;

  D3D12_HEAP_PROPERTIES heapProps;
  heapProps.Type = D3D12_HEAP_TYPE_READBACK;
  heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
  heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
  heapProps.CreationNodeMask = 1;
  heapProps.VisibleNodeMask = 1;

  D3D12_RESOURCE_DESC bufDesc;
  bufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
  bufDesc.Alignment = 0;
  bufDesc.Width = sizeof(uint64_t) * maxEID * 2;
  bufDesc.Height = 1;
  bufDesc.DepthOrArraySize = 1;
  bufDesc.MipLevels = 1;
  bufDesc.Format = DXGI_FORMAT_UNKNOWN;
  bufDesc.SampleDesc.Count = 1;
  bufDesc.SampleDesc.Quality = 0;
  bufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
  bufDesc.Flags = D3D12_RESOURCE_FLAG_NONE;

  ID3D12Resource *readbackBuf = NULL;
  HRESULT hr = m_pDevice->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &bufDesc,
                                                  D3D12_RESOURCE_STATE_COPY_DEST, NULL,
                                                  __uuidof(ID3D12Resource), (void **)&readbackBuf);
  if(FAILED(hr))
  {
    RDCERR("Failed to create query readback buffer %08x", hr);
    return ret;
  }

  D3D12_QUERY_HEAP_DESC queryDesc;
  queryDesc.Count = maxEID * 2;
  queryDesc.NodeMask = 1;
  queryDesc.Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
  ID3D12QueryHeap *queryHeap = NULL;
  hr = m_pDevice->CreateQueryHeap(&queryDesc, __uuidof(queryHeap), (void **)&queryHeap);
  if(FAILED(hr))
  {
    RDCERR("Failed to create query heap %08x", hr);
    SAFE_RELEASE(readbackBuf);
    return ret;
  }

  m_pDevice->SetStablePowerState(TRUE);

  // NOTE(review): presumably the callback records a begin/end timestamp pair into queryHeap
  // around each event as it is replayed - confirm against D3D12GPUTimerCallback
  D3D12GPUTimerCallback cb(m_pDevice, this, queryHeap);

  // replay the events to perform all the queries
  m_pDevice->ReplayLog(0, maxEID, eReplay_Full);

  m_pDevice->SetStablePowerState(FALSE);

  ID3D12GraphicsCommandList *list = m_pDevice->GetNewList();

  list->ResolveQueryData(queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, 0, maxEID * 2, readbackBuf, 0);

  list->Close();

  m_pDevice->ExecuteLists();
  m_pDevice->FlushLists();

  D3D12_RANGE range = {0, (SIZE_T)bufDesc.Width};
  void *data = NULL;
  hr = readbackBuf->Map(0, &range, &data);
  if(FAILED(hr))
  {
    RDCERR("Failed to map query readback buffer %08x", hr);
    SAFE_RELEASE(queryHeap);
    SAFE_RELEASE(readbackBuf);
    return ret;
  }

  // we only read from the mapping, so unmap with an empty written range
  D3D12_RANGE noWrites = {0, 0};

  uint64_t freq = 0;
  hr = m_pDevice->GetQueue()->GetTimestampFrequency(&freq);
  if(FAILED(hr) || freq == 0)
  {
    RDCERR("Failed to get timestamp frequency %08x", hr);
    readbackBuf->Unmap(0, &noWrites);
    SAFE_RELEASE(queryHeap);
    SAFE_RELEASE(readbackBuf);
    return ret;
  }

  const uint64_t *timestamps = (const uint64_t *)data;

  for(size_t i = 0; i < cb.m_Results.size(); i++)
  {
    CounterResult result;

    // timestamps are stored in begin/end pairs, in the order the results were recorded
    uint64_t delta = timestamps[i * 2 + 1] - timestamps[i * 2 + 0];

    result.eventID = cb.m_Results[i];
    result.counterID = eCounter_EventGPUDuration;
    result.value.d = double(delta) / double(freq);

    ret.push_back(result);
  }

  // all the timestamps have been read, so the D3D12 objects are no longer needed
  readbackBuf->Unmap(0, &noWrites);

  SAFE_RELEASE(queryHeap);
  SAFE_RELEASE(readbackBuf);

  for(size_t i = 0; i < cb.m_AliasEvents.size(); i++)
  {
    CounterResult search;
    search.counterID = eCounter_EventGPUDuration;
    search.eventID = cb.m_AliasEvents[i].first;

    // find the result we're aliasing
    auto it = std::find(ret.begin(), ret.end(), search);
    RDCASSERT(it != ret.end());

    // don't dereference an invalid iterator if the assert above doesn't stop us
    if(it == ret.end())
      continue;

    // duplicate the result and append
    CounterResult aliased = *it;
    aliased.eventID = cb.m_AliasEvents[i].second;
    ret.push_back(aliased);
  }

  // sort so that the alias results appear in the right places
  std::sort(ret.begin(), ret.end());

  return ret;
}