예제 #1
0
Mat4f GLContext::xformMouseToUser(const Mat4f& userToClip) const
{
    return
        userToClip.inverted() *
        Mat4f::scale(Vec3f(1.0f, -1.0f, 1.0f)) *
        Mat4f::translate(Vec3f(-1.0f, -1.0f, 0.0f)) *
        Mat4f::scale(Vec3f(m_viewScale, 1.0f)) *
        Mat4f::translate(Vec3f(0.5f, 0.5f, 0.0f));
}
String CudaRenderer::renderObject(
    Image&          frame,
    OctreeRuntime*  runtime,
    int             objectID,
    const Mat4f&    octreeToWorld,
    const Mat4f&    worldToCamera,
    const Mat4f&    projection)
{
    FW_ASSERT(runtime);

    // Check frame buffer validity.

    if (frame.getSize().min() <= 0)
        return "";

    if (frame.getFormat() != ImageFormat::ABGR_8888 ||
        frame.getStride() != frame.getSize().x * frame.getBPP())
    {
        return "CudaRenderer: Incompatible framebuffer!";
    }

    // Determine preprocessor defines.

    const Array<AttachIO::AttachType>& attach = runtime->getAttachTypes(objectID);
    FW_ASSERT(attach.getSize() == AttachSlot_Max);

    m_compiler.clearDefines();

    bool enableContours = (attach[AttachSlot_Contour] == AttachIO::ContourAttach && m_params.enableContours);
    if (enableContours)
        m_compiler.define("ENABLE_CONTOURS");

    switch (attach[AttachSlot_Attribute])
    {
    case AttachIO::ColorNormalPaletteAttach:    m_compiler.define("VOXELATTRIB_PALETTE"); m_compiler.define("DISABLE_PUSH_OPTIMIZATION"); break;
    case AttachIO::ColorNormalCornerAttach:     m_compiler.define("VOXELATTRIB_CORNER"); m_compiler.define("DISABLE_PUSH_OPTIMIZATION"); break;
    case AttachIO::ColorNormalDXTAttach:        m_compiler.define("VOXELATTRIB_DXT"); break;
    default:                                    return "Unsupported attribute attachment!";
    }

    if (attach[AttachSlot_AO] == AttachIO::AOAttach)
        m_compiler.define("VOXELATTRIB_AO");

    if (m_params.measureRaycastPerf)
        m_compiler.define("KERNEL_RAYCAST_PERF");
    else
        m_compiler.define("KERNEL_RENDER");

    if (m_params.enablePerfCounters)
        m_compiler.define("ENABLE_PERF_COUNTERS");

    if (m_params.enableLargeReconstruction)
        m_compiler.define("LARGE_RECONSTRUCTION_KERNEL");

    if (m_params.enableJitterLOD)
        m_compiler.define("JITTER_LOD");

    if (m_params.visualization == Visualization_PrimaryAndShadow)
        m_compiler.define("ENABLE_SHADOWS");

    if (!m_blurLUT.getSize())
        constructBlurLUT();
    m_compiler.define("BLUR_LUT_SIZE", String(m_blurLUT.getSize()));

    // Determine flags.

    U32 flags = 0;
    if (m_params.visualization == Visualization_IterationCount)
        flags |= RenderFlags_VisualizeIterations;
    else if (m_params.visualization == Visualization_RaycastLevel)
        flags |= RenderFlags_VisualizeRaycastLevel;

    // Set input.

    m_input.frameSize       = frame.getSize();
    m_input.flags           = flags;
    m_input.batchSize       = m_params.batchSize;
    m_input.aaRays          = (m_params.enableAntialias) ? 4 : 1;
    m_input.maxVoxelSize    = m_params.maxVoxelSize;
    m_input.brightness      = m_params.brightness;
    m_input.coarseSize      = m_params.coarseSize;
    m_input.coarseFrameSize = (m_input.frameSize + (m_params.coarseSize - 1)) / m_params.coarseSize + 1;
    m_input.frame           = frame.getBuffer().getMutableCudaPtr();
    m_input.rootNode        = runtime->getRootNodeCuda(objectID);

    OctreeMatrices& om      = m_input.octreeMatrices;
    Vec3f scale             = Vec3f(Vec2f(2.0f) / Vec2f(m_input.frameSize), 1.0f);
    om.viewportToCamera     = projection.inverted() * Mat4f::translate(Vec3f(-1.0f, -1.0f, 0.0f)) * Mat4f::scale(scale);
    om.cameraToOctree       = Mat4f::translate(Vec3f(1.0f)) * (worldToCamera * octreeToWorld).inverted();
    Mat4f vto               = om.cameraToOctree * om.viewportToCamera;
    om.pixelInOctree        = sqrt(Vec4f(vto.col(0)).getXYZ().cross(Vec4f(vto.col(1)).getXYZ()).length());

    om.octreeToWorld        = octreeToWorld * Mat4f::translate(Vec3f(-1.0f));
    om.worldToOctree        = invert(om.octreeToWorld);
    om.octreeToWorldN       = octreeToWorld.getXYZ().inverted().transposed();
    om.cameraPosition       = invert(worldToCamera) * Vec3f(0.f, 0.f, 0.f);
    om.octreeToViewport     = invert(om.viewportToCamera) * invert(om.cameraToOctree);
    om.viewportToOctreeN    = (om.octreeToViewport).transposed();

    // Setup frame-related buffers.

    int numPixels = m_input.frameSize.x * m_input.frameSize.y;
    if (m_pixelTable.getSize() != m_input.frameSize)
    {
        m_indexToPixel.resizeDiscard(numPixels * sizeof(S32));
        m_pixelTable.setSize(m_input.frameSize);
        memcpy(m_indexToPixel.getMutablePtr(), m_pixelTable.getIndexToPixel(), numPixels * sizeof(S32));
    }

    // Coarse frame and pixel buffers.

    int coarseNumPixels = m_input.coarseFrameSize.x * m_input.coarseFrameSize.y;
    m_coarseFrameBuffer.resizeDiscard(coarseNumPixels * sizeof(S32));
    m_input.frameCoarse = m_coarseFrameBuffer.getMutableCudaPtr();

    if (m_coarsePixelTable.getSize() != m_input.coarseFrameSize)
    {
        m_coarseIndexToPixel.resizeDiscard(coarseNumPixels * sizeof(S32));
        m_coarsePixelTable.setSize(m_input.coarseFrameSize);
        memcpy(m_coarseIndexToPixel.getMutablePtr(), m_coarsePixelTable.getIndexToPixel(), coarseNumPixels * sizeof(S32));
        m_coarseIndexToPixel.free(Buffer::CPU);
    }

    // Temp frame buffer for blurring.

    if (m_params.enableBlur)
    {
        // override frame buffer address!
        m_tempFrameBuffer.resizeDiscard(numPixels * sizeof(U32));
        m_input.frame = m_tempFrameBuffer.getMutableCudaPtr();
    }

    // AA sample buffer
    if (m_input.aaRays > 1)
    {
        m_aaSampleBuffer.resizeDiscard(numPixels * m_input.aaRays * sizeof(U32));
        m_input.aaSampleBuffer = m_aaSampleBuffer.getMutableCudaPtr();
    }

    // Setup performance counter buffer.

    if (m_params.enablePerfCounters)
    {
        m_perfCounters.resizeDiscard(m_numWarps * PerfCounter_Max * 33 * sizeof(S64));
        memset(m_perfCounters.getMutablePtr(), 0, (size_t)m_perfCounters.getSize());
        m_input.perfCounters = m_perfCounters.getMutableCudaPtr();
    }

    // Render.

    LaunchResult coarseResult;
    if (m_params.enableBeamOptimization)
    {
        RenderInput old = m_input;
        m_input.numPrimaryRays = coarseNumPixels;
        m_input.aaRays = 1;
        m_input.flags |= RenderFlags_CoarsePass;
        m_input.batchSize = 1;
        m_compiler.undef("ENABLE_CONTOURS");

        coarseResult = launch(coarseNumPixels * m_params.numFrameRepeats, false);

        m_input = old;
        m_input.flags |= RenderFlags_UseCoarseData;
        if (enableContours)
            m_compiler.define("ENABLE_CONTOURS");
    }

    m_input.numPrimaryRays = numPixels * m_input.aaRays;
    LaunchResult renderResult = launch(m_input.numPrimaryRays * m_params.numFrameRepeats, true);

    // Post-process blur.
    F32 blurTime = 0.f;
    if (m_params.enableBlur)
    {
        // restore true frame buffer pointer
        m_input.frame = frame.getBuffer().getMutableCudaPtr();

        // get module
        CudaModule* module = m_compiler.compile();

        // update blur LUT
        Vec4i* pLUT = (Vec4i*)module->getGlobal("c_blurLUT").getMutablePtr();
        for (int i=0; i < m_blurLUT.getSize(); i++)
        {
            float d = sqrtf((float)sqr(m_blurLUT[i].x) + (float)sqr(m_blurLUT[i].y));
            Vec4i& v = pLUT[i];
            v.x = m_blurLUT[i].x;
            v.y = m_blurLUT[i].y;
            v.z = floatToBits((float)m_blurLUT[i].z);
            v.w = floatToBits(d);
        }

        // update globals
        *(RenderInput*)module->getGlobal("c_input").getMutablePtr() = m_input;
        module->setTexRef("texTempFrameIn", m_tempFrameBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);
        module->setTexRef("texAASamplesIn", m_aaSampleBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);

        // launch
        blurTime = module->getKernel("blurKernel").launchTimed(frame.getSize(), Vec2i(8));

    }

    // Update statistics.

    F32 totalTime           = renderResult.time + coarseResult.time + blurTime;
    m_results.launchTime    += totalTime;
    m_results.coarseTime    += coarseResult.time;
    m_results.renderWarps   += renderResult.numWarps;
    m_results.coarseWarps   += coarseResult.numWarps;

    if (m_params.enablePerfCounters)
    {
        const S64* ptr = (const S64*)m_perfCounters.getPtr();
        for (int warpIdx = 0; warpIdx < m_numWarps; warpIdx++)
        {
            for (int counterIdx = 0; counterIdx < PerfCounter_Max; counterIdx++)
            {
                for (int threadIdx = 0; threadIdx < 32; threadIdx++)
                    m_results.threadCounters[counterIdx] += *ptr++;
                m_results.warpCounters[counterIdx] += *ptr++;
            }
        }
    }

    m_stats = sprintf("CudaRenderer: launch %.2f ms (%.2f FPS), %.2f MPix/s",
        totalTime * 1.0e3f,
        1.0f / totalTime,
        numPixels * 1.0e-6f / totalTime);

    if (m_params.enableBlur)
        m_stats += sprintf(", blur %.2f MPix/s", numPixels * 1.0e-6f / blurTime);

    // Adjust the number of warps for the next run.

    int maxWarps = max(renderResult.numWarps, coarseResult.numWarps);
    if (maxWarps * 2 > m_numWarps)
    {
        if (maxWarps == m_numWarps)
            printf("CudaRenderer: warp count auto-detect overflow, increasing warp count to %d\n", maxWarps * 2);
        else
            printf("CudaRenderer: warp count auto-detected: %d warps, launching %d\n", maxWarps, maxWarps * 2);
        m_numWarps = maxWarps * 2;
    }
    return "";
}