Beispiel #1
0
void CudaTracer::setKernel(const String& kernelName)
{
    // Not changed => done.

    if (m_kernelName == kernelName)
        return;
    m_kernelName = kernelName;

    // Compile kernel.

    CudaModule* module = compileKernel();

    // Initialize config with default values.
    {
        KernelConfig& c         = *(KernelConfig*)module->getGlobal("g_config").getMutablePtr();
        c.bvhLayout             = BVHLayout_Max;
        c.blockWidth            = 0;
        c.blockHeight           = 0;
        c.usePersistentThreads  = 0;
    }

    // Query config.

    module->getKernel("queryConfig").launch(1, 1);
    m_kernelConfig = *(const KernelConfig*)module->getGlobal("g_config").getPtr();
}
// Compiles the render module with the current defines, uploads m_input and
// the texture bindings, launches "kernel", and reports the measured time
// together with the number of non-zero entries in the active-warp table.
//
// totalWork         - stored into m_input.totalWork; also sizes the grid when
//                     persistent threads are not used.
// persistentThreads - requests the PERSISTENT_THREADS variant; that variant
//                     is also used whenever perf counters are enabled.
CudaRenderer::LaunchResult CudaRenderer::launch(int totalWork, bool persistentThreads)
{
    // Block shape comes from compile-time constants; the grid is a single
    // row of blocks whose width is decided below.
    Vec2i blockDims(RCK_TRACE_BLOCK_WIDTH, RCK_TRACE_BLOCK_HEIGHT);
    Vec2i gridDims(0, 1);
    int threadsPerBlock = blockDims.x * blockDims.y;
    int warpSlots = 0;

    m_input.totalWork = totalWork;

    bool usePersistent = (persistentThreads || m_params.enablePerfCounters);
    if (!usePersistent)
    {
        // One thread per work item, rounded up to whole blocks.
        m_compiler.undef("PERSISTENT_THREADS");
        gridDims.x = (totalWork + threadsPerBlock - 1) / threadsPerBlock;
    }
    else
    {
        // Enough blocks to host m_numWarps warps of 32 threads, rounded up.
        m_compiler.define("PERSISTENT_THREADS");
        gridDims.x = (m_numWarps * 32 + threadsPerBlock - 1) / threadsPerBlock;

        // One S32 slot per launched warp, cleared before the launch.
        warpSlots = (gridDims.x * gridDims.y) * ((threadsPerBlock + 31) / 32);
        m_activeWarps.resizeDiscard(warpSlots * sizeof(S32));
        memset(m_activeWarps.getMutablePtr(), 0, warpSlots * sizeof(S32));
        m_input.activeWarps = m_activeWarps.getMutableCudaPtr();
    }

    // Build (or fetch) the module matching the defines chosen above.
    CudaModule* module = m_compiler.compile();

    // Upload the render input, reset the warp counter, and bind textures.
    RenderInput* deviceInput = (RenderInput*)module->getGlobal("c_input").getMutablePtr();
    *deviceInput = m_input;
    S32* warpCounter = (S32*)module->getGlobal("g_warpCounter").getMutablePtr();
    *warpCounter = 0;
    module->setTexRef("texIndexToPixel", m_indexToPixel, CU_AD_FORMAT_UNSIGNED_INT32, 1);
    module->setTexRef("texFrameCoarseIn", m_coarseFrameBuffer, CU_AD_FORMAT_FLOAT, 1);
    module->setTexRef("texIndexToPixelCoarse", m_coarseIndexToPixel, CU_AD_FORMAT_UNSIGNED_INT32, 1);

    // Timed launch. The third argument is the negation of measureRaycastPerf;
    // NOTE(review): its meaning is defined by launchTimed, not visible here.
    LaunchResult res;
    res.numWarps = 0;
    res.time = module->getKernel("kernel").launchTimed(gridDims * blockDims, blockDims, (!m_params.measureRaycastPerf));

    // Tally the table slots that are non-zero after the launch. The table
    // has zero slots in the non-persistent case, so the loop is skipped.
    const S32* table = (const S32*)m_activeWarps.getPtr();
    for (int slot = 0; slot < warpSlots; slot++)
        res.numWarps += (table[slot] != 0) ? 1 : 0;
    return res;
}
Beispiel #3
0
// Traces all rays in the given buffer against the current BVH using the
// currently selected kernel, and returns the value reported by
// CudaKernel::launchTimed() for the launch.
//
// rays - ray batch; its ray buffer is read and its result buffer is written.
//
// Returns 0.0f without launching anything when the batch is empty.
// Calls fail() when there is no BVH, when the BVH layout differs from the one
// the kernel wants, or when the kernel configuration has a non-positive
// block size.
F32 CudaTracer::traceBatch(RayBuffer& rays)
{
    // No rays => done.

    int numRays = rays.getSize();
    if (!numRays)
        return 0.0f;

    // Check BVH consistency.

    if (!m_bvh)
        fail("CudaTracer: No BVH!");
    if (m_bvh->getLayout() != getDesiredBVHLayout())
        fail("CudaTracer: Incorrect BVH layout!");

    // Check kernel configuration.
    // blockWidth/blockHeight are seeded with 0 before the kernel is queried;
    // if they are still non-positive here, the block count computation below
    // would divide by zero. Reported the same way as the BVH checks above.

    if (m_kernelConfig.blockWidth <= 0 || m_kernelConfig.blockHeight <= 0)
        fail("CudaTracer: Invalid kernel block size!");

    // Get BVH buffers.
    // Each sub-array is a Vec2i whose .x is used as an offset from the buffer
    // base and whose .y is passed as the size of the texture binding.

    CUdeviceptr nodePtr     = m_bvh->getNodeBuffer().getCudaPtr();
    CUdeviceptr triPtr      = m_bvh->getTriWoopBuffer().getCudaPtr();
    Buffer&     indexBuf    = m_bvh->getTriIndexBuffer();
    Vec2i       nodeOfsA    = m_bvh->getNodeSubArray(0);
    Vec2i       nodeOfsB    = m_bvh->getNodeSubArray(1);
    Vec2i       nodeOfsC    = m_bvh->getNodeSubArray(2);
    Vec2i       nodeOfsD    = m_bvh->getNodeSubArray(3);
    Vec2i       triOfsA     = m_bvh->getTriWoopSubArray(0);
    Vec2i       triOfsB     = m_bvh->getTriWoopSubArray(1);
    Vec2i       triOfsC     = m_bvh->getTriWoopSubArray(2);

    // Compile kernel.

    CudaModule* module = compileKernel();
    CudaKernel kernel = module->getKernel("trace");

    // Set parameters.

    kernel.setParams(
        numRays,                                    // numRays
        (rays.getNeedClosestHit()) ? 0 : 1,         // anyHit
        rays.getRayBuffer().getCudaPtr(),           // rays
        rays.getResultBuffer().getMutableCudaPtr(), // results
        nodePtr + nodeOfsA.x,                       // nodesA
        nodePtr + nodeOfsB.x,                       // nodesB
        nodePtr + nodeOfsC.x,                       // nodesC
        nodePtr + nodeOfsD.x,                       // nodesD
        triPtr + triOfsA.x,                         // trisA
        triPtr + triOfsB.x,                         // trisB
        triPtr + triOfsC.x,                         // trisC
        indexBuf.getCudaPtr());                     // triIndices

    // Set texture references.
    // The same data is exposed both as raw pointers (above) and as textures.

    module->setTexRef("t_rays", rays.getRayBuffer(), CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesA", nodePtr + nodeOfsA.x, nodeOfsA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesB", nodePtr + nodeOfsB.x, nodeOfsB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesC", nodePtr + nodeOfsC.x, nodeOfsC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_nodesD", nodePtr + nodeOfsD.x, nodeOfsD.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisA", triPtr + triOfsA.x, triOfsA.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisB", triPtr + triOfsB.x, triOfsB.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_trisC", triPtr + triOfsC.x, triOfsC.y, CU_AD_FORMAT_FLOAT, 4);
    module->setTexRef("t_triIndices", indexBuf, CU_AD_FORMAT_SIGNED_INT32, 1);

    // Determine block and grid sizes.
    // By default one warp (32 threads) is launched per 32 rays, rounded up.
    // Persistent-thread kernels instead get a fixed warp count and a zeroed
    // g_warpCounter.

    int desiredWarps = (numRays + 31) / 32;
    if (m_kernelConfig.usePersistentThreads != 0)
    {
        *(S32*)module->getGlobal("g_warpCounter").getMutablePtr() = 0;
        desiredWarps = 720; // Tesla: 30 SMs * 24 warps, Fermi: 15 SMs * 48 warps
    }

    // blockWarps is at least 1 here because both block dimensions were
    // verified to be positive above.
    Vec2i blockSize(m_kernelConfig.blockWidth, m_kernelConfig.blockHeight);
    int blockWarps = (blockSize.x * blockSize.y + 31) / 32;
    int numBlocks = (desiredWarps + blockWarps - 1) / blockWarps;

    // Launch.

    return kernel.launchTimed(numBlocks * blockSize.x * blockSize.y, blockSize);
}
// Renders one octree object into the given frame buffer.
//
// The function selects preprocessor defines from the object's attachment
// types and m_params, fills m_input, (re)sizes the auxiliary buffers, runs an
// optional coarse pass followed by the main pass, optionally runs the blur
// kernel, and finally updates m_results, m_stats and m_numWarps.
//
// frame         - destination image; must be ABGR_8888 with a stride equal to
//                 width * bytes-per-pixel.
// runtime       - octree runtime supplying attachment types and the root node
//                 for objectID; must be non-null (asserted).
// objectID      - object to render.
// octreeToWorld, worldToCamera, projection - transforms used to derive the
//                 matrices stored in m_input.octreeMatrices.
//
// Returns an empty string on success (including the case where the frame has
// a non-positive dimension, in which nothing is rendered), or an error
// message when the frame buffer or the attribute attachment is unsupported.
String CudaRenderer::renderObject(
    Image&          frame,
    OctreeRuntime*  runtime,
    int             objectID,
    const Mat4f&    octreeToWorld,
    const Mat4f&    worldToCamera,
    const Mat4f&    projection)
{
    FW_ASSERT(runtime);

    // Check frame buffer validity.
    // A frame with a non-positive dimension is not an error: nothing to draw.

    if (frame.getSize().min() <= 0)
        return "";

    if (frame.getFormat() != ImageFormat::ABGR_8888 ||
        frame.getStride() != frame.getSize().x * frame.getBPP())
    {
        return "CudaRenderer: Incompatible framebuffer!";
    }

    // Determine preprocessor defines.
    // All defines are rebuilt from scratch on every call.

    const Array<AttachIO::AttachType>& attach = runtime->getAttachTypes(objectID);
    FW_ASSERT(attach.getSize() == AttachSlot_Max);

    m_compiler.clearDefines();

    // Contours need both the attachment and the user parameter.
    bool enableContours = (attach[AttachSlot_Contour] == AttachIO::ContourAttach && m_params.enableContours);
    if (enableContours)
        m_compiler.define("ENABLE_CONTOURS");

    // Palette and corner attributes additionally disable the push
    // optimization; DXT does not.
    switch (attach[AttachSlot_Attribute])
    {
    case AttachIO::ColorNormalPaletteAttach:    m_compiler.define("VOXELATTRIB_PALETTE"); m_compiler.define("DISABLE_PUSH_OPTIMIZATION"); break;
    case AttachIO::ColorNormalCornerAttach:     m_compiler.define("VOXELATTRIB_CORNER"); m_compiler.define("DISABLE_PUSH_OPTIMIZATION"); break;
    case AttachIO::ColorNormalDXTAttach:        m_compiler.define("VOXELATTRIB_DXT"); break;
    default:                                    return "Unsupported attribute attachment!";
    }

    if (attach[AttachSlot_AO] == AttachIO::AOAttach)
        m_compiler.define("VOXELATTRIB_AO");

    // Exactly one of the two kernel flavors is selected.
    if (m_params.measureRaycastPerf)
        m_compiler.define("KERNEL_RAYCAST_PERF");
    else
        m_compiler.define("KERNEL_RENDER");

    if (m_params.enablePerfCounters)
        m_compiler.define("ENABLE_PERF_COUNTERS");

    if (m_params.enableLargeReconstruction)
        m_compiler.define("LARGE_RECONSTRUCTION_KERNEL");

    if (m_params.enableJitterLOD)
        m_compiler.define("JITTER_LOD");

    if (m_params.visualization == Visualization_PrimaryAndShadow)
        m_compiler.define("ENABLE_SHADOWS");

    // The blur LUT is built lazily on first use; its size is baked into the
    // compiled module.
    if (!m_blurLUT.getSize())
        constructBlurLUT();
    m_compiler.define("BLUR_LUT_SIZE", String(m_blurLUT.getSize()));

    // Determine flags.

    U32 flags = 0;
    if (m_params.visualization == Visualization_IterationCount)
        flags |= RenderFlags_VisualizeIterations;
    else if (m_params.visualization == Visualization_RaycastLevel)
        flags |= RenderFlags_VisualizeRaycastLevel;

    // Set input.
    // coarseFrameSize is ceil(frameSize / coarseSize) plus one in each
    // dimension.

    m_input.frameSize       = frame.getSize();
    m_input.flags           = flags;
    m_input.batchSize       = m_params.batchSize;
    m_input.aaRays          = (m_params.enableAntialias) ? 4 : 1;
    m_input.maxVoxelSize    = m_params.maxVoxelSize;
    m_input.brightness      = m_params.brightness;
    m_input.coarseSize      = m_params.coarseSize;
    m_input.coarseFrameSize = (m_input.frameSize + (m_params.coarseSize - 1)) / m_params.coarseSize + 1;
    m_input.frame           = frame.getBuffer().getMutableCudaPtr();
    m_input.rootNode        = runtime->getRootNodeCuda(objectID);

    // Derive the transforms handed to the kernels from the caller's matrices.
    // The viewport-to-camera matrix first scales coordinates by 2/frameSize
    // and shifts them by -1 in x and y before applying the inverse projection.
    OctreeMatrices& om      = m_input.octreeMatrices;
    Vec3f scale             = Vec3f(Vec2f(2.0f) / Vec2f(m_input.frameSize), 1.0f);
    om.viewportToCamera     = projection.inverted() * Mat4f::translate(Vec3f(-1.0f, -1.0f, 0.0f)) * Mat4f::scale(scale);
    om.cameraToOctree       = Mat4f::translate(Vec3f(1.0f)) * (worldToCamera * octreeToWorld).inverted();
    Mat4f vto               = om.cameraToOctree * om.viewportToCamera;
    // Square root of the area spanned by the first two columns of vto.
    om.pixelInOctree        = sqrt(Vec4f(vto.col(0)).getXYZ().cross(Vec4f(vto.col(1)).getXYZ()).length());

    om.octreeToWorld        = octreeToWorld * Mat4f::translate(Vec3f(-1.0f));
    om.worldToOctree        = invert(om.octreeToWorld);
    om.octreeToWorldN       = octreeToWorld.getXYZ().inverted().transposed();
    om.cameraPosition       = invert(worldToCamera) * Vec3f(0.f, 0.f, 0.f);
    om.octreeToViewport     = invert(om.viewportToCamera) * invert(om.cameraToOctree);
    om.viewportToOctreeN    = (om.octreeToViewport).transposed();

    // Setup frame-related buffers.
    // The index-to-pixel table is regenerated only when the frame size
    // differs from the size the pixel table currently has.

    int numPixels = m_input.frameSize.x * m_input.frameSize.y;
    if (m_pixelTable.getSize() != m_input.frameSize)
    {
        m_indexToPixel.resizeDiscard(numPixels * sizeof(S32));
        m_pixelTable.setSize(m_input.frameSize);
        memcpy(m_indexToPixel.getMutablePtr(), m_pixelTable.getIndexToPixel(), numPixels * sizeof(S32));
    }

    // Coarse frame and pixel buffers.

    int coarseNumPixels = m_input.coarseFrameSize.x * m_input.coarseFrameSize.y;
    m_coarseFrameBuffer.resizeDiscard(coarseNumPixels * sizeof(S32));
    m_input.frameCoarse = m_coarseFrameBuffer.getMutableCudaPtr();

    if (m_coarsePixelTable.getSize() != m_input.coarseFrameSize)
    {
        m_coarseIndexToPixel.resizeDiscard(coarseNumPixels * sizeof(S32));
        m_coarsePixelTable.setSize(m_input.coarseFrameSize);
        memcpy(m_coarseIndexToPixel.getMutablePtr(), m_coarsePixelTable.getIndexToPixel(), coarseNumPixels * sizeof(S32));
        // NOTE(review): presumably releases the host-side copy after the
        // table has been filled -- confirm against Buffer::free().
        m_coarseIndexToPixel.free(Buffer::CPU);
    }

    // Temp frame buffer for blurring.
    // With blur enabled the render kernels write into the temp buffer; the
    // real frame pointer is restored right before the blur kernel runs.

    if (m_params.enableBlur)
    {
        // override frame buffer address!
        m_tempFrameBuffer.resizeDiscard(numPixels * sizeof(U32));
        m_input.frame = m_tempFrameBuffer.getMutableCudaPtr();
    }

    // AA sample buffer: one U32 per ray per pixel, only when antialiasing.
    if (m_input.aaRays > 1)
    {
        m_aaSampleBuffer.resizeDiscard(numPixels * m_input.aaRays * sizeof(U32));
        m_input.aaSampleBuffer = m_aaSampleBuffer.getMutableCudaPtr();
    }

    // Setup performance counter buffer.
    // Layout per warp and counter: 32 per-thread values followed by one
    // per-warp value (33 S64 entries), matching the readback loop below.

    if (m_params.enablePerfCounters)
    {
        m_perfCounters.resizeDiscard(m_numWarps * PerfCounter_Max * 33 * sizeof(S64));
        memset(m_perfCounters.getMutablePtr(), 0, (size_t)m_perfCounters.getSize());
        m_input.perfCounters = m_perfCounters.getMutableCudaPtr();
    }

    // Render.
    // coarseResult is consumed unconditionally by the statistics and the
    // warp-count logic below, but only assigned when the coarse pass runs.
    // Zero it explicitly so the "no coarse pass" case does not depend on how
    // LaunchResult default-constructs.

    LaunchResult coarseResult;
    coarseResult.time = 0.0f;
    coarseResult.numWarps = 0;
    if (m_params.enableBeamOptimization)
    {
        // Coarse pass: one ray per coarse pixel, no antialiasing, batch size
        // 1, contours disabled, non-persistent launch. The previous input is
        // restored afterwards.
        RenderInput old = m_input;
        m_input.numPrimaryRays = coarseNumPixels;
        m_input.aaRays = 1;
        m_input.flags |= RenderFlags_CoarsePass;
        m_input.batchSize = 1;
        m_compiler.undef("ENABLE_CONTOURS");

        coarseResult = launch(coarseNumPixels * m_params.numFrameRepeats, false);

        // The main pass is told that coarse data is available.
        m_input = old;
        m_input.flags |= RenderFlags_UseCoarseData;
        if (enableContours)
            m_compiler.define("ENABLE_CONTOURS");
    }

    // Main pass, requested with persistent threads.
    m_input.numPrimaryRays = numPixels * m_input.aaRays;
    LaunchResult renderResult = launch(m_input.numPrimaryRays * m_params.numFrameRepeats, true);

    // Post-process blur.
    F32 blurTime = 0.f;
    if (m_params.enableBlur)
    {
        // restore true frame buffer pointer
        m_input.frame = frame.getBuffer().getMutableCudaPtr();

        // get module
        CudaModule* module = m_compiler.compile();

        // update blur LUT
        // Each entry stores x and y as integers, and z and the Euclidean
        // length of (x, y) as float bit patterns.
        Vec4i* pLUT = (Vec4i*)module->getGlobal("c_blurLUT").getMutablePtr();
        for (int i=0; i < m_blurLUT.getSize(); i++)
        {
            float d = sqrtf((float)sqr(m_blurLUT[i].x) + (float)sqr(m_blurLUT[i].y));
            Vec4i& v = pLUT[i];
            v.x = m_blurLUT[i].x;
            v.y = m_blurLUT[i].y;
            v.z = floatToBits((float)m_blurLUT[i].z);
            v.w = floatToBits(d);
        }

        // update globals
        *(RenderInput*)module->getGlobal("c_input").getMutablePtr() = m_input;
        module->setTexRef("texTempFrameIn", m_tempFrameBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);
        module->setTexRef("texAASamplesIn", m_aaSampleBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);

        // launch
        blurTime = module->getKernel("blurKernel").launchTimed(frame.getSize(), Vec2i(8));

    }

    // Update statistics.

    F32 totalTime           = renderResult.time + coarseResult.time + blurTime;
    m_results.launchTime    += totalTime;
    m_results.coarseTime    += coarseResult.time;
    m_results.renderWarps   += renderResult.numWarps;
    m_results.coarseWarps   += coarseResult.numWarps;

    if (m_params.enablePerfCounters)
    {
        // Accumulate the 32 per-thread values and the trailing per-warp value
        // of every counter, for every warp.
        const S64* ptr = (const S64*)m_perfCounters.getPtr();
        for (int warpIdx = 0; warpIdx < m_numWarps; warpIdx++)
        {
            for (int counterIdx = 0; counterIdx < PerfCounter_Max; counterIdx++)
            {
                for (int threadIdx = 0; threadIdx < 32; threadIdx++)
                    m_results.threadCounters[counterIdx] += *ptr++;
                m_results.warpCounters[counterIdx] += *ptr++;
            }
        }
    }

    // totalTime is scaled by 1e3 for the "ms" figure below.
    m_stats = sprintf("CudaRenderer: launch %.2f ms (%.2f FPS), %.2f MPix/s",
        totalTime * 1.0e3f,
        1.0f / totalTime,
        numPixels * 1.0e-6f / totalTime);

    if (m_params.enableBlur)
        m_stats += sprintf(", blur %.2f MPix/s", numPixels * 1.0e-6f / blurTime);

    // Adjust the number of warps for the next run.
    // Keep m_numWarps at no less than twice the largest active-warp count
    // observed in this call; it is never decreased here.

    int maxWarps = max(renderResult.numWarps, coarseResult.numWarps);
    if (maxWarps * 2 > m_numWarps)
    {
        if (maxWarps == m_numWarps)
            printf("CudaRenderer: warp count auto-detect overflow, increasing warp count to %d\n", maxWarps * 2);
        else
            printf("CudaRenderer: warp count auto-detected: %d warps, launching %d\n", maxWarps, maxWarps * 2);
        m_numWarps = maxWarps * 2;
    }
    return "";
}