// Traces every slot of the given ray buffer through this BVH, one ray at a time.
//
// rays  - buffer providing the rays; the result stored for each slot is
//         cleared and then handed to traceRecursive() to be filled in.
// stats - optional statistics record (may be null). When present, its
//         platform field is set and numRays is incremented once per ray.
void BVH::trace(RayBuffer& rays, RayStats* stats) const
{
    S32 slot = 0;
    while (slot < rays.getSize())
    {
        // Operate on a private copy of the ray; the result is edited in place.
        Ray localRay = rays.getRayForSlot(slot);
        RayResult& hit = rays.getMutableResultForSlot(slot);
        hit.clear();

        // Reset the treelet bookkeeping before each ray.
        // NOTE(review): -2 looks like a "nothing visited yet" sentinel -- confirm.
        currentTreelet = -2;
        uniqueTreelets.clear();

        // Per-ray statistics, only when the caller asked for them.
        if (stats)
        {
            stats->platform = m_platform;
            stats->numRays++;
        }

        traceRecursive(m_root, localRay, hit, rays.getNeedClosestHit(), stats);

        ++slot;
    }
}
// Example no. 2
// 0
// Launches the "trace" kernel for every ray in the buffer.
//
// rays - ray buffer; its ray data is passed to the kernel as input and its
//        result buffer is passed as a mutable output pointer.
//
// Returns 0.0f immediately when the buffer holds no rays; otherwise returns
// whatever kernel.launchTimed() reports (presumably the measured launch
// time -- confirm against CudaKernel).
//
// Calls fail() when no BVH is set or when the BVH layout differs from
// getDesiredBVHLayout().
F32 CudaTracer::traceBatch(RayBuffer& rays)
{
    // Nothing to trace.

    int numRays = rays.getSize();
    if (numRays == 0)
        return 0.0f;

    // Validate the BVH before touching any of its buffers.

    if (!m_bvh)
        fail("CudaTracer: No BVH!");
    if (m_bvh->getLayout() != getDesiredBVHLayout())
        fail("CudaTracer: Incorrect BVH layout!");

    // Fetch the BVH buffers and their sub-array descriptors.
    // For each Vec2i, .x is added to the buffer base address and .y is
    // forwarded to setTexRef() as the third argument (presumably offset and
    // size in bytes -- confirm against the BVH class).

    CUdeviceptr nodeBase    = m_bvh->getNodeBuffer().getCudaPtr();
    CUdeviceptr triBase     = m_bvh->getTriWoopBuffer().getCudaPtr();
    Buffer&     triIndices  = m_bvh->getTriIndexBuffer();
    Vec2i       nodeRangeA  = m_bvh->getNodeSubArray(0);
    Vec2i       nodeRangeB  = m_bvh->getNodeSubArray(1);
    Vec2i       nodeRangeC  = m_bvh->getNodeSubArray(2);
    Vec2i       nodeRangeD  = m_bvh->getNodeSubArray(3);
    Vec2i       triRangeA   = m_bvh->getTriWoopSubArray(0);
    Vec2i       triRangeB   = m_bvh->getTriWoopSubArray(1);
    Vec2i       triRangeC   = m_bvh->getTriWoopSubArray(2);

    // Resolve each sub-array to its device address once; the same addresses
    // feed both the kernel parameters and the texture references.

    CUdeviceptr nodesA  = nodeBase + nodeRangeA.x;
    CUdeviceptr nodesB  = nodeBase + nodeRangeB.x;
    CUdeviceptr nodesC  = nodeBase + nodeRangeC.x;
    CUdeviceptr nodesD  = nodeBase + nodeRangeD.x;
    CUdeviceptr trisA   = triBase + triRangeA.x;
    CUdeviceptr trisB   = triBase + triRangeB.x;
    CUdeviceptr trisC   = triBase + triRangeC.x;

    // Compile the module and look up the kernel entry point.

    CudaModule* module = compileKernel();
    CudaKernel kernel = module->getKernel("trace");

    // Kernel parameters.

    int anyHit = rays.getNeedClosestHit() ? 0 : 1;  // 1 when closest hit is not required

    kernel.setParams(
        numRays,                                    // numRays
        anyHit,                                     // anyHit
        rays.getRayBuffer().getCudaPtr(),           // rays
        rays.getResultBuffer().getMutableCudaPtr(), // results
        nodesA,                                     // nodesA
        nodesB,                                     // nodesB
        nodesC,                                     // nodesC
        nodesD,                                     // nodesD
        trisA,                                      // trisA
        trisB,                                      // trisB
        trisC,                                      // trisC
        triIndices.getCudaPtr());                   // triIndices

    // Texture references over the same data.

    module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesA", nodesA, nodeRangeA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesB", nodesB, nodeRangeB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesC", nodesC, nodeRangeC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesD", nodesD, nodeRangeD.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisA", trisA, triRangeA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisB", trisB, triRangeB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisC", trisC, triRangeC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_triIndices", triIndices, CU_AD_FORMAT_SIGNED_INT32, 1);

    // Decide how many warps to launch.

    int desiredWarps;
    if (m_kernelConfig.usePersistentThreads == 0)
    {
        // One thread per ray, rounded up to whole 32-thread warps.
        desiredWarps = (numRays + 31) / 32;
    }
    else
    {
        // Persistent threads: zero the g_warpCounter global and launch a
        // fixed number of warps regardless of the ray count.
        *(S32*)module->getGlobal("g_warpCounter").getMutablePtr() = 0;
        desiredWarps = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
    }

    // Convert the warp count into whole blocks of the configured size.

    Vec2i blockSize(m_kernelConfig.blockWidth, m_kernelConfig.blockHeight);
    int threadsPerBlock = blockSize.x * blockSize.y;
    int warpsPerBlock = (threadsPerBlock + 31) / 32;
    int numBlocks = (desiredWarps + warpsPerBlock - 1) / warpsPerBlock;

    // Launch with the total thread count and the block shape.

    return kernel.launchTimed(numBlocks * threadsPerBlock, blockSize);
}