// Traces every ray stored in `rays` through the hierarchy starting at m_root
// and writes each outcome into the result slot that shares the ray's index.
//
// rays  - ray buffer; each slot's ray is read and its result is overwritten.
// stats - optional statistics sink (may be null). When non-null, its platform
//         field is set to m_platform and numRays is incremented once per ray.
void BVH::trace(RayBuffer& rays, RayStats* stats) const
{
    for (S32 slot = 0; slot < rays.getSize(); slot++)
    {
        // Work on a private copy of the ray; the buffer's ray is left untouched.
        Ray localRay = rays.getRayForSlot(slot);

        // Start from an empty result for this slot.
        RayResult& slotResult = rays.getMutableResultForSlot(slot);
        slotResult.clear();

        // Reset the per-ray treelet bookkeeping before descending.
        // NOTE(review): -2 looks like a "no treelet visited yet" sentinel — confirm
        // against the code that reads currentTreelet.
        currentTreelet = -2;
        uniqueTreelets.clear();

        if (stats)
        {
            stats->platform = m_platform;
            stats->numRays++;
        }

        traceRecursive(m_root, localRay, slotResult, rays.getNeedClosestHit(), stats);
    }
}
// Launches the "trace" CUDA kernel for all rays in `rays` against the
// currently assigned BVH.
//
// rays - ray buffer; its ray data is passed to the kernel as input and its
//        result buffer is passed as a mutable output pointer.
//
// Returns 0.0f immediately when the buffer holds no rays; otherwise returns
// the value produced by CudaKernel::launchTimed().
// NOTE(review): presumably the measured kernel execution time — confirm units
// against launchTimed().
//
// Calls fail() when no BVH is set or when the BVH layout differs from
// getDesiredBVHLayout().
F32 CudaTracer::traceBatch(RayBuffer& rays)
{
    // An empty batch requires no work.
    const int rayCount = rays.getSize();
    if (rayCount == 0)
        return 0.0f;

    // The BVH must exist and use the layout this tracer expects.
    if (!m_bvh)
        fail("CudaTracer: No BVH!");
    if (m_bvh->getLayout() != getDesiredBVHLayout())
        fail("CudaTracer: Incorrect BVH layout!");

    // Fetch the device buffers backing the BVH.
    CUdeviceptr nodeBase    = m_bvh->getNodeBuffer().getCudaPtr();
    CUdeviceptr triBase     = m_bvh->getTriWoopBuffer().getCudaPtr();
    Buffer&     triIndexBuf = m_bvh->getTriIndexBuffer();

    // Each sub-array is described by a Vec2i: .x is added to the base pointer,
    // .y is handed to setTexRef() as the size argument.
    Vec2i nodeRangeA = m_bvh->getNodeSubArray(0);
    Vec2i nodeRangeB = m_bvh->getNodeSubArray(1);
    Vec2i nodeRangeC = m_bvh->getNodeSubArray(2);
    Vec2i nodeRangeD = m_bvh->getNodeSubArray(3);
    Vec2i triRangeA  = m_bvh->getTriWoopSubArray(0);
    Vec2i triRangeB  = m_bvh->getTriWoopSubArray(1);
    Vec2i triRangeC  = m_bvh->getTriWoopSubArray(2);

    // Resolve the start address of every sub-array once; the same addresses
    // feed both the kernel parameters and the texture references.
    CUdeviceptr nodesA = nodeBase + nodeRangeA.x;
    CUdeviceptr nodesB = nodeBase + nodeRangeB.x;
    CUdeviceptr nodesC = nodeBase + nodeRangeC.x;
    CUdeviceptr nodesD = nodeBase + nodeRangeD.x;
    CUdeviceptr trisA  = triBase + triRangeA.x;
    CUdeviceptr trisB  = triBase + triRangeB.x;
    CUdeviceptr trisC  = triBase + triRangeC.x;

    // Compile (or fetch) the module and look up the kernel entry point.
    CudaModule* module = compileKernel();
    CudaKernel  kernel = module->getKernel("trace");

    // Kernel parameters, in the order the kernel declares them.
    kernel.setParams(
        rayCount,                                   // numRays
        (rays.getNeedClosestHit()) ? 0 : 1,         // anyHit
        rays.getRayBuffer().getCudaPtr(),           // rays
        rays.getResultBuffer().getMutableCudaPtr(), // results
        nodesA,                                     // nodesA
        nodesB,                                     // nodesB
        nodesC,                                     // nodesC
        nodesD,                                     // nodesD
        trisA,                                      // trisA
        trisB,                                      // trisB
        trisC,                                      // trisC
        triIndexBuf.getCudaPtr());                  // triIndices

    // Texture references: the same data exposed through the texture path.
    module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesA", nodesA, nodeRangeA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesB", nodesB, nodeRangeB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesC", nodesC, nodeRangeC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesD", nodesD, nodeRangeD.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisA", trisA, triRangeA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisB", trisB, triRangeB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisC", trisC, triRangeC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_triIndices", triIndexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);

    // Decide how many warps to launch.
    int warpsWanted;
    if (m_kernelConfig.usePersistentThreads != 0)
    {
        // Persistent threads: zero the global warp counter and launch a fixed
        // number of warps regardless of the ray count.
        *(S32*)module->getGlobal("g_warpCounter").getMutablePtr() = 0;
        warpsWanted = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
    }
    else
    {
        // One thread per ray, rounded up to whole 32-thread warps.
        warpsWanted = (rayCount + 31) / 32;
    }

    // Turn the warp count into a whole number of blocks of the configured size.
    Vec2i blockDims(m_kernelConfig.blockWidth, m_kernelConfig.blockHeight);
    int threadsPerBlock = blockDims.x * blockDims.y;
    int warpsPerBlock   = (threadsPerBlock + 31) / 32;
    int blockCount      = (warpsWanted + warpsPerBlock - 1) / warpsPerBlock;

    // Launch with the total thread count and the per-block dimensions.
    return kernel.launchTimed(blockCount * threadsPerBlock, blockDims);
}