void CudaTracer::setKernel(const String& kernelName) { // Not changed => done. if (m_kernelName == kernelName) return; m_kernelName = kernelName; // Compile kernel. CudaModule* module = compileKernel(); // Initialize config with default values. { KernelConfig& c = *(KernelConfig*)module->getGlobal("g_config").getMutablePtr(); c.bvhLayout = BVHLayout_Max; c.blockWidth = 0; c.blockHeight = 0; c.usePersistentThreads = 0; } // Query config. module->getKernel("queryConfig").launch(1, 1); m_kernelConfig = *(const KernelConfig*)module->getGlobal("g_config").getPtr(); }
// Compiles the render module for the current set of defines and launches
// its "kernel" entry point once.
//
// totalWork         - stored in m_input.totalWork; also determines the grid
//                     width when the non-persistent variant is used.
// persistentThreads - requests the PERSISTENT_THREADS variant. Enabled
//                     performance counters select that variant as well.
//
// Returns a LaunchResult holding the value reported by launchTimed and the
// number of non-zero entries found in the active-warp table after the
// launch (the table has zero entries in the non-persistent variant).
CudaRenderer::LaunchResult CudaRenderer::launch(int totalWork, bool persistentThreads)
{
    Vec2i blockExtent(RCK_TRACE_BLOCK_WIDTH, RCK_TRACE_BLOCK_HEIGHT);
    Vec2i gridExtent(0, 1);
    int threadsPerBlock = blockExtent.x * blockExtent.y;
    int warpSlots = 0;

    m_input.totalWork = totalWork;

    bool usePersistent = (persistentThreads || m_params.enablePerfCounters);
    if (!usePersistent)
    {
        // One thread per work item, rounded up to whole blocks.
        m_compiler.undef("PERSISTENT_THREADS");
        gridExtent.x = (totalWork + threadsPerBlock - 1) / threadsPerBlock;
    }
    else
    {
        // Size the grid from the current warp budget (m_numWarps) and
        // provide one zeroed S32 table slot per launched warp.
        m_compiler.define("PERSISTENT_THREADS");
        gridExtent.x = (m_numWarps * 32 + threadsPerBlock - 1) / threadsPerBlock;
        warpSlots = (gridExtent.x * gridExtent.y) * ((threadsPerBlock + 31) / 32);

        size_t tableBytes = warpSlots * sizeof(S32);
        m_activeWarps.resizeDiscard(tableBytes);
        memset(m_activeWarps.getMutablePtr(), 0, tableBytes);
        m_input.activeWarps = m_activeWarps.getMutableCudaPtr();
    }

    // Compile with the defines selected above.
    CudaModule* module = m_compiler.compile();

    // Upload the launch inputs, reset the warp counter and bind textures.
    *(RenderInput*)module->getGlobal("c_input").getMutablePtr() = m_input;
    *(S32*)module->getGlobal("g_warpCounter").getMutablePtr() = 0;
    module->setTexRef("texIndexToPixel", m_indexToPixel, CU_AD_FORMAT_UNSIGNED_INT32, 1);
    module->setTexRef("texFrameCoarseIn", m_coarseFrameBuffer, CU_AD_FORMAT_FLOAT, 1);
    module->setTexRef("texIndexToPixelCoarse", m_coarseIndexToPixel, CU_AD_FORMAT_UNSIGNED_INT32, 1);

    // Timed launch.
    LaunchResult result;
    result.time = module->getKernel("kernel").launchTimed(
        gridExtent * blockExtent,
        blockExtent,
        (!m_params.measureRaycastPerf));

    // Count the table slots that are non-zero after the launch.
    const S32* table = (const S32*)m_activeWarps.getPtr();
    result.numWarps = 0;
    for (int slot = 0; slot < warpSlots; ++slot)
    {
        if (table[slot] != 0)
            result.numWarps++;
    }
    return result;
}
// Traces every ray in the given buffer against the current BVH using the
// "trace" kernel of the compiled module.
//
// rays - ray buffer supplying the input rays (getRayBuffer) and receiving
//        the results (getResultBuffer). getNeedClosestHit() selects between
//        closest-hit (anyHit = 0) and any-hit (anyHit = 1) operation.
//
// Returns 0.0f without launching anything when the buffer holds no rays;
// otherwise returns the value reported by CudaKernel::launchTimed.
//
// Calls fail() when no BVH is set, when the BVH layout differs from
// getDesiredBVHLayout(), or when the cached kernel configuration reports a
// non-positive block size.
F32 CudaTracer::traceBatch(RayBuffer& rays)
{
    // No rays => done.
    int numRays = rays.getSize();
    if (!numRays)
        return 0.0f;

    // Check BVH consistency.
    if (!m_bvh)
        fail("CudaTracer: No BVH!");
    if (m_bvh->getLayout() != getDesiredBVHLayout())
        fail("CudaTracer: Incorrect BVH layout!");

    // Check kernel configuration. setKernel() seeds blockWidth/blockHeight
    // with 0; if they were never overwritten the block-count computation
    // below would divide by zero.
    Vec2i blockSize(m_kernelConfig.blockWidth, m_kernelConfig.blockHeight);
    if (blockSize.x <= 0 || blockSize.y <= 0)
        fail("CudaTracer: Invalid kernel block size!");

    // Get BVH buffers. Each sub-array is used as (x = offset added to the
    // base pointer, y = size passed to setTexRef).
    CUdeviceptr nodePtr  = m_bvh->getNodeBuffer().getCudaPtr();
    CUdeviceptr triPtr   = m_bvh->getTriWoopBuffer().getCudaPtr();
    Buffer&     indexBuf = m_bvh->getTriIndexBuffer();
    Vec2i       nodeOfsA = m_bvh->getNodeSubArray(0);
    Vec2i       nodeOfsB = m_bvh->getNodeSubArray(1);
    Vec2i       nodeOfsC = m_bvh->getNodeSubArray(2);
    Vec2i       nodeOfsD = m_bvh->getNodeSubArray(3);
    Vec2i       triOfsA  = m_bvh->getTriWoopSubArray(0);
    Vec2i       triOfsB  = m_bvh->getTriWoopSubArray(1);
    Vec2i       triOfsC  = m_bvh->getTriWoopSubArray(2);

    // Compile kernel.
    CudaModule* module = compileKernel();
    CudaKernel  kernel = module->getKernel("trace");

    // Set parameters.
    kernel.setParams(
        numRays,                                    // numRays
        (rays.getNeedClosestHit()) ? 0 : 1,         // anyHit
        rays.getRayBuffer().getCudaPtr(),           // rays
        rays.getResultBuffer().getMutableCudaPtr(), // results
        nodePtr + nodeOfsA.x,                       // nodesA
        nodePtr + nodeOfsB.x,                       // nodesB
        nodePtr + nodeOfsC.x,                       // nodesC
        nodePtr + nodeOfsD.x,                       // nodesD
        triPtr + triOfsA.x,                         // trisA
        triPtr + triOfsB.x,                         // trisB
        triPtr + triOfsC.x,                         // trisC
        indexBuf.getCudaPtr());                     // triIndices

    // Set texture references. The same data is exposed both as kernel
    // parameters (above) and as textures.
    module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesA", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesB", nodePtr + nodeOfsB.x, nodeOfsB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesC", nodePtr + nodeOfsC.x, nodeOfsC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesD", nodePtr + nodeOfsD.x, nodeOfsD.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisB", triPtr + triOfsB.x, triOfsB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisC", triPtr + triOfsC.x, triOfsC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);

    // Determine block and grid sizes. By default one warp per 32 rays.
    int desiredWarps = (numRays + 31) / 32;
    if (m_kernelConfig.usePersistentThreads != 0)
    {
        // Persistent variant: reset the warp counter and launch a fixed
        // number of warps regardless of the ray count.
        *(S32*)module->getGlobal("g_warpCounter").getMutablePtr() = 0;
        desiredWarps = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
    }

    // blockWarps >= 1 here because the block size was validated above.
    int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
    int numBlocks  = (desiredWarps + blockWarps - 1) / blockWarps;

    // Launch.
    return kernel.launchTimed(numBlocks * blockSize.x * blockSize.y, blockSize);
}
// Renders one octree object into the given frame.
//
// frame         - destination image. Must be ABGR_8888 with a stride equal to
//                 width * bytes-per-pixel.
// runtime       - octree runtime that owns the object (must be non-null).
// objectID      - object whose attachments and root node are used.
// octreeToWorld - object transform.
// worldToCamera - view transform.
// projection    - projection matrix.
//
// Returns an empty string on success, and also when the frame has a
// non-positive dimension (nothing is rendered in that case). Returns an
// error message when the frame buffer is incompatible or the attribute
// attachment type is not supported.
//
// Side effects visible in this function: rebuilds the compiler define set,
// overwrites m_input, resizes the frame-related buffers, accumulates into
// m_results, rewrites m_stats and may increase m_numWarps for the next run.
String CudaRenderer::renderObject(
    Image&          frame,
    OctreeRuntime*  runtime,
    int             objectID,
    const Mat4f&    octreeToWorld,
    const Mat4f&    worldToCamera,
    const Mat4f&    projection)
{
    FW_ASSERT(runtime);

    // Check frame buffer validity.
    if (frame.getSize().min() <= 0)
        return "";

    if (frame.getFormat() != ImageFormat::ABGR_8888 ||
        frame.getStride() != frame.getSize().x * frame.getBPP())
    {
        return "CudaRenderer: Incompatible framebuffer!";
    }

    // Determine preprocessor defines.
    const Array<AttachIO::AttachType>& attach = runtime->getAttachTypes(objectID);
    FW_ASSERT(attach.getSize() == AttachSlot_Max);

    m_compiler.clearDefines();

    bool enableContours = (attach[AttachSlot_Contour] == AttachIO::ContourAttach && m_params.enableContours);
    if (enableContours)
        m_compiler.define("ENABLE_CONTOURS");

    switch (attach[AttachSlot_Attribute])
    {
    case AttachIO::ColorNormalPaletteAttach:
        m_compiler.define("VOXELATTRIB_PALETTE");
        m_compiler.define("DISABLE_PUSH_OPTIMIZATION");
        break;

    case AttachIO::ColorNormalCornerAttach:
        m_compiler.define("VOXELATTRIB_CORNER");
        m_compiler.define("DISABLE_PUSH_OPTIMIZATION");
        break;

    case AttachIO::ColorNormalDXTAttach:
        m_compiler.define("VOXELATTRIB_DXT");
        break;

    default:
        return "Unsupported attribute attachment!";
    }

    if (attach[AttachSlot_AO] == AttachIO::AOAttach)
        m_compiler.define("VOXELATTRIB_AO");

    if (m_params.measureRaycastPerf)
        m_compiler.define("KERNEL_RAYCAST_PERF");
    else
        m_compiler.define("KERNEL_RENDER");

    if (m_params.enablePerfCounters)
        m_compiler.define("ENABLE_PERF_COUNTERS");
    if (m_params.enableLargeReconstruction)
        m_compiler.define("LARGE_RECONSTRUCTION_KERNEL");
    if (m_params.enableJitterLOD)
        m_compiler.define("JITTER_LOD");
    if (m_params.visualization == Visualization_PrimaryAndShadow)
        m_compiler.define("ENABLE_SHADOWS");

    // The blur LUT is built lazily; its size is baked into the kernel.
    if (!m_blurLUT.getSize())
        constructBlurLUT();
    m_compiler.define("BLUR_LUT_SIZE", String(m_blurLUT.getSize()));

    // Determine flags.
    U32 flags = 0;
    if (m_params.visualization == Visualization_IterationCount)
        flags |= RenderFlags_VisualizeIterations;
    else if (m_params.visualization == Visualization_RaycastLevel)
        flags |= RenderFlags_VisualizeRaycastLevel;

    // Set input.
    m_input.frameSize       = frame.getSize();
    m_input.flags           = flags;
    m_input.batchSize       = m_params.batchSize;
    m_input.aaRays          = (m_params.enableAntialias) ? 4 : 1;
    m_input.maxVoxelSize    = m_params.maxVoxelSize;
    m_input.brightness      = m_params.brightness;
    m_input.coarseSize      = m_params.coarseSize;
    m_input.coarseFrameSize = (m_input.frameSize + (m_params.coarseSize - 1)) / m_params.coarseSize + 1;
    m_input.frame           = frame.getBuffer().getMutableCudaPtr();
    m_input.rootNode        = runtime->getRootNodeCuda(objectID);

    // Transformation matrices handed to the kernel.
    OctreeMatrices& om = m_input.octreeMatrices;
    Vec3f scale = Vec3f(Vec2f(2.0f) / Vec2f(m_input.frameSize), 1.0f);
    om.viewportToCamera = projection.inverted() * Mat4f::translate(Vec3f(-1.0f, -1.0f, 0.0f)) * Mat4f::scale(scale);
    om.cameraToOctree   = Mat4f::translate(Vec3f(1.0f)) * (worldToCamera * octreeToWorld).inverted();
    Mat4f vto = om.cameraToOctree * om.viewportToCamera;
    om.pixelInOctree    = sqrt(Vec4f(vto.col(0)).getXYZ().cross(Vec4f(vto.col(1)).getXYZ()).length());

    om.octreeToWorld     = octreeToWorld * Mat4f::translate(Vec3f(-1.0f));
    om.worldToOctree     = invert(om.octreeToWorld);
    om.octreeToWorldN    = octreeToWorld.getXYZ().inverted().transposed();
    om.cameraPosition    = invert(worldToCamera) * Vec3f(0.f, 0.f, 0.f);
    om.octreeToViewport  = invert(om.viewportToCamera) * invert(om.cameraToOctree);
    om.viewportToOctreeN = (om.octreeToViewport).transposed();

    // Setup frame-related buffers. The index-to-pixel table is rebuilt only
    // when the frame size changed since the previous call.
    int numPixels = m_input.frameSize.x * m_input.frameSize.y;
    if (m_pixelTable.getSize() != m_input.frameSize)
    {
        m_indexToPixel.resizeDiscard(numPixels * sizeof(S32));
        m_pixelTable.setSize(m_input.frameSize);
        memcpy(m_indexToPixel.getMutablePtr(), m_pixelTable.getIndexToPixel(), numPixels * sizeof(S32));
    }

    // Coarse frame and pixel buffers.
    int coarseNumPixels = m_input.coarseFrameSize.x * m_input.coarseFrameSize.y;
    m_coarseFrameBuffer.resizeDiscard(coarseNumPixels * sizeof(S32));
    m_input.frameCoarse = m_coarseFrameBuffer.getMutableCudaPtr();

    if (m_coarsePixelTable.getSize() != m_input.coarseFrameSize)
    {
        m_coarseIndexToPixel.resizeDiscard(coarseNumPixels * sizeof(S32));
        m_coarsePixelTable.setSize(m_input.coarseFrameSize);
        memcpy(m_coarseIndexToPixel.getMutablePtr(), m_coarsePixelTable.getIndexToPixel(), coarseNumPixels * sizeof(S32));
        m_coarseIndexToPixel.free(Buffer::CPU);
    }

    // Temp frame buffer for blurring.
    if (m_params.enableBlur)
    {
        // Override frame buffer address: the render pass writes into the
        // temporary buffer and the blur pass writes the real frame.
        m_tempFrameBuffer.resizeDiscard(numPixels * sizeof(U32));
        m_input.frame = m_tempFrameBuffer.getMutableCudaPtr();
    }

    // AA sample buffer.
    if (m_input.aaRays > 1)
    {
        m_aaSampleBuffer.resizeDiscard(numPixels * m_input.aaRays * sizeof(U32));
        m_input.aaSampleBuffer = m_aaSampleBuffer.getMutableCudaPtr();
    }

    // Setup performance counter buffer: for every warp and counter there are
    // 33 S64 values (read back below as 32 per-thread values followed by one
    // per-warp value).
    if (m_params.enablePerfCounters)
    {
        m_perfCounters.resizeDiscard(m_numWarps * PerfCounter_Max * 33 * sizeof(S64));
        memset(m_perfCounters.getMutablePtr(), 0, (size_t)m_perfCounters.getSize());
        m_input.perfCounters = m_perfCounters.getMutableCudaPtr();
    }

    // Render.
    //
    // coarseResult is only assigned by the optional coarse pass but is read
    // unconditionally by the statistics and warp auto-detection below, so it
    // is zeroed explicitly instead of relying on LaunchResult's default
    // construction.
    LaunchResult coarseResult;
    coarseResult.time     = 0.0f;
    coarseResult.numWarps = 0;

    if (m_params.enableBeamOptimization)
    {
        // Coarse pass: one ray per coarse pixel, no AA, no contours,
        // non-persistent launch. m_input is restored afterwards.
        RenderInput old = m_input;
        m_input.numPrimaryRays = coarseNumPixels;
        m_input.aaRays = 1;
        m_input.flags |= RenderFlags_CoarsePass;
        m_input.batchSize = 1;
        m_compiler.undef("ENABLE_CONTOURS");

        coarseResult = launch(coarseNumPixels * m_params.numFrameRepeats, false);

        m_input = old;
        m_input.flags |= RenderFlags_UseCoarseData;
        if (enableContours)
            m_compiler.define("ENABLE_CONTOURS");
    }

    m_input.numPrimaryRays = numPixels * m_input.aaRays;
    LaunchResult renderResult = launch(m_input.numPrimaryRays * m_params.numFrameRepeats, true);

    // Post-process blur.
    F32 blurTime = 0.f;
    if (m_params.enableBlur)
    {
        // Restore true frame buffer pointer.
        m_input.frame = frame.getBuffer().getMutableCudaPtr();

        // Get module.
        CudaModule* module = m_compiler.compile();

        // Update blur LUT: x/y are copied as integers, z and the distance
        // sqrt(x^2 + y^2) are stored as float bit patterns.
        Vec4i* pLUT = (Vec4i*)module->getGlobal("c_blurLUT").getMutablePtr();
        for (int i = 0; i < m_blurLUT.getSize(); i++)
        {
            float d = sqrtf((float)sqr(m_blurLUT[i].x) + (float)sqr(m_blurLUT[i].y));
            Vec4i& v = pLUT[i];
            v.x = m_blurLUT[i].x;
            v.y = m_blurLUT[i].y;
            v.z = floatToBits((float)m_blurLUT[i].z);
            v.w = floatToBits(d);
        }

        // Update globals.
        *(RenderInput*)module->getGlobal("c_input").getMutablePtr() = m_input;
        module->setTexRef("texTempFrameIn", m_tempFrameBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);
        module->setTexRef("texAASamplesIn", m_aaSampleBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);

        // Launch.
        blurTime = module->getKernel("blurKernel").launchTimed(frame.getSize(), Vec2i(8));
    }

    // Update statistics.
    F32 totalTime = renderResult.time + coarseResult.time + blurTime;
    m_results.launchTime  += totalTime;
    m_results.coarseTime  += coarseResult.time;
    m_results.renderWarps += renderResult.numWarps;
    m_results.coarseWarps += coarseResult.numWarps;

    if (m_params.enablePerfCounters)
    {
        // Walk the counter buffer in the layout described above.
        const S64* ptr = (const S64*)m_perfCounters.getPtr();
        for (int warpIdx = 0; warpIdx < m_numWarps; warpIdx++)
        {
            for (int counterIdx = 0; counterIdx < PerfCounter_Max; counterIdx++)
            {
                for (int lane = 0; lane < 32; lane++)
                    m_results.threadCounters[counterIdx] += *ptr++;
                m_results.warpCounters[counterIdx] += *ptr++;
            }
        }
    }

    m_stats = sprintf("CudaRenderer: launch %.2f ms (%.2f FPS), %.2f MPix/s",
        totalTime * 1.0e3f,
        1.0f / totalTime,
        numPixels * 1.0e-6f / totalTime);

    if (m_params.enableBlur)
        m_stats += sprintf(", blur %.2f MPix/s", numPixels * 1.0e-6f / blurTime);

    // Adjust the number of warps for the next run: whenever more than half
    // of the current budget was observed active, the budget becomes twice
    // the observed count.
    int maxWarps = max(renderResult.numWarps, coarseResult.numWarps);
    if (maxWarps * 2 > m_numWarps)
    {
        if (maxWarps == m_numWarps)
            printf("CudaRenderer: warp count auto-detect overflow, increasing warp count to %d\n", maxWarps * 2);
        else
            printf("CudaRenderer: warp count auto-detected: %d warps, launching %d\n", maxWarps, maxWarps * 2);
        m_numWarps = maxWarps * 2;
    }
    return "";
}