// Builds the matrix returned for converting mouse coordinates to user space,
// given the user-to-clip transform.
//
// The result is the product, in this left-to-right order, of:
//   1. the inverse of userToClip,
//   2. a scale of (1, -1, 1), i.e. a Y flip,
//   3. a translation by (-1, -1, 0),
//   4. a scale of (m_viewScale, 1),
//   5. a translation by (0.5, 0.5, 0).
// NOTE(review): presumably vectors are transformed as columns, so factor 5 is
// applied to the mouse position first and factor 1 last -- confirm against Mat4f.
Mat4f GLContext::xformMouseToUser(const Mat4f& userToClip) const
{
    const Mat4f clipToUser  = userToClip.inverted();
    const Mat4f flipY       = Mat4f::scale(Vec3f(1.0f, -1.0f, 1.0f));
    const Mat4f shiftToNdc  = Mat4f::translate(Vec3f(-1.0f, -1.0f, 0.0f));
    const Mat4f applyScale  = Mat4f::scale(Vec3f(m_viewScale, 1.0f));
    const Mat4f halfOffset  = Mat4f::translate(Vec3f(0.5f, 0.5f, 0.0f));

    // Keep the multiplication strictly left-associative, as in a plain chain.
    return clipToUser * flipY * shiftToNdc * applyScale * halfOffset;
}
// Renders a single octree object into `frame` using the CUDA kernels.
//
// Work performed, in order:
//   - validate the frame buffer (size, pixel format, stride),
//   - select the kernel preprocessor defines from the object's attachment
//     types and from m_params,
//   - fill m_input (sizes, flags, matrices, device pointers),
//   - (re)build the pixel index tables and size the auxiliary buffers,
//   - launch the optional coarse pass and then the main pass,
//   - optionally run the blur kernel from the temporary frame into `frame`,
//   - accumulate timing / warp / performance-counter statistics,
//   - grow m_numWarps for the next call if the launches used too many warps.
//
// Parameters:
//   frame          Destination image; must be ABGR_8888 with
//                  stride == width * bytes-per-pixel.
//   runtime        Octree runtime that owns the object (asserted non-null).
//   objectID       Object to render within `runtime`.
//   octreeToWorld  Object placement transform.
//   worldToCamera  View transform.
//   projection     Projection transform.
//
// Returns an empty string on success, and also when the frame has a
// non-positive dimension (nothing is rendered in that case). Otherwise returns
// a message describing why rendering was refused.
String CudaRenderer::renderObject(
    Image&          frame,
    OctreeRuntime*  runtime,
    int             objectID,
    const Mat4f&    octreeToWorld,
    const Mat4f&    worldToCamera,
    const Mat4f&    projection)
{
    FW_ASSERT(runtime);

    // An empty frame is silently skipped rather than reported.

    if (frame.getSize().min() <= 0)
        return "";

    // Only tightly packed ABGR_8888 frames are accepted.

    if (frame.getFormat() != ImageFormat::ABGR_8888)
        return "CudaRenderer: Incompatible framebuffer!";
    if (frame.getStride() != frame.getSize().x * frame.getBPP())
        return "CudaRenderer: Incompatible framebuffer!";

    // Kernel preprocessor defines derived from the object's attachments.

    const Array<AttachIO::AttachType>& attachTypes = runtime->getAttachTypes(objectID);
    FW_ASSERT(attachTypes.getSize() == AttachSlot_Max);
    m_compiler.clearDefines();

    // Contours require both the attachment and the user setting.
    const bool contoursActive =
        (attachTypes[AttachSlot_Contour] == AttachIO::ContourAttach && m_params.enableContours);
    if (contoursActive)
        m_compiler.define("ENABLE_CONTOURS");

    const AttachIO::AttachType attribType = attachTypes[AttachSlot_Attribute];
    if (attribType == AttachIO::ColorNormalPaletteAttach)
    {
        m_compiler.define("VOXELATTRIB_PALETTE");
        m_compiler.define("DISABLE_PUSH_OPTIMIZATION");
    }
    else if (attribType == AttachIO::ColorNormalCornerAttach)
    {
        m_compiler.define("VOXELATTRIB_CORNER");
        m_compiler.define("DISABLE_PUSH_OPTIMIZATION");
    }
    else if (attribType == AttachIO::ColorNormalDXTAttach)
    {
        m_compiler.define("VOXELATTRIB_DXT");
    }
    else
    {
        return "Unsupported attribute attachment!";
    }

    if (attachTypes[AttachSlot_AO] == AttachIO::AOAttach)
        m_compiler.define("VOXELATTRIB_AO");

    // Kernel preprocessor defines derived from the renderer parameters.

    const char* kernelDefine = m_params.measureRaycastPerf ? "KERNEL_RAYCAST_PERF" : "KERNEL_RENDER";
    m_compiler.define(kernelDefine);

    if (m_params.enablePerfCounters)
        m_compiler.define("ENABLE_PERF_COUNTERS");
    if (m_params.enableLargeReconstruction)
        m_compiler.define("LARGE_RECONSTRUCTION_KERNEL");
    if (m_params.enableJitterLOD)
        m_compiler.define("JITTER_LOD");
    if (m_params.visualization == Visualization_PrimaryAndShadow)
        m_compiler.define("ENABLE_SHADOWS");

    // The blur LUT is built on first use; its size is baked into the kernel.
    if (m_blurLUT.getSize() == 0)
        constructBlurLUT();
    m_compiler.define("BLUR_LUT_SIZE", String(m_blurLUT.getSize()));

    // Runtime flags selecting a debug visualization, if any.

    U32 renderFlags = 0;
    if (m_params.visualization == Visualization_IterationCount)
        renderFlags |= RenderFlags_VisualizeIterations;
    else if (m_params.visualization == Visualization_RaycastLevel)
        renderFlags |= RenderFlags_VisualizeRaycastLevel;

    // Scalar kernel inputs.

    m_input.frameSize       = frame.getSize();
    m_input.flags           = renderFlags;
    m_input.batchSize       = m_params.batchSize;
    if (m_params.enableAntialias)
        m_input.aaRays = 4;
    else
        m_input.aaRays = 1;
    m_input.maxVoxelSize    = m_params.maxVoxelSize;
    m_input.brightness      = m_params.brightness;
    m_input.coarseSize      = m_params.coarseSize;

    // Coarse frame: frame size divided by coarseSize, rounded up, plus one.
    m_input.coarseFrameSize = (m_input.frameSize + (m_params.coarseSize - 1)) / m_params.coarseSize + 1;
    m_input.frame           = frame.getBuffer().getMutableCudaPtr();
    m_input.rootNode        = runtime->getRootNodeCuda(objectID);

    // Transformation matrices handed to the kernels.

    OctreeMatrices& matrices = m_input.octreeMatrices;

    // Maps pixel coordinates [0, frameSize] to [0, 2]; the translate below
    // then shifts that range to [-1, 1].
    Vec3f pixelScale = Vec3f(Vec2f(2.0f) / Vec2f(m_input.frameSize), 1.0f);

    matrices.viewportToCamera = projection.inverted() * Mat4f::translate(Vec3f(-1.0f, -1.0f, 0.0f)) * Mat4f::scale(pixelScale);
    matrices.cameraToOctree   = Mat4f::translate(Vec3f(1.0f)) * (worldToCamera * octreeToWorld).inverted();

    // Pixel footprint: square root of the length of the cross product of the
    // first two columns of the viewport-to-octree matrix.
    Mat4f viewportToOctree = matrices.cameraToOctree * matrices.viewportToCamera;
    Vec3f stepX = Vec4f(viewportToOctree.col(0)).getXYZ();
    Vec3f stepY = Vec4f(viewportToOctree.col(1)).getXYZ();
    matrices.pixelInOctree    = sqrt(stepX.cross(stepY).length());

    matrices.octreeToWorld    = octreeToWorld * Mat4f::translate(Vec3f(-1.0f));
    matrices.worldToOctree    = invert(matrices.octreeToWorld);
    matrices.octreeToWorldN   = octreeToWorld.getXYZ().inverted().transposed();
    matrices.cameraPosition   = invert(worldToCamera) * Vec3f(0.f, 0.f, 0.f);
    matrices.octreeToViewport = invert(matrices.viewportToCamera) * invert(matrices.cameraToOctree);
    matrices.viewportToOctreeN = (matrices.octreeToViewport).transposed();

    // Full-resolution pixel index table; rebuilt only when the frame size changes.

    const int pixelCount = m_input.frameSize.x * m_input.frameSize.y;
    if (m_pixelTable.getSize() != m_input.frameSize)
    {
        const size_t tableBytes = pixelCount * sizeof(S32);
        m_indexToPixel.resizeDiscard(tableBytes);
        m_pixelTable.setSize(m_input.frameSize);
        memcpy(m_indexToPixel.getMutablePtr(), m_pixelTable.getIndexToPixel(), tableBytes);
    }

    // Coarse frame buffer and its pixel index table.

    const int coarsePixelCount = m_input.coarseFrameSize.x * m_input.coarseFrameSize.y;
    const size_t coarseBytes = coarsePixelCount * sizeof(S32);
    m_coarseFrameBuffer.resizeDiscard(coarseBytes);
    m_input.frameCoarse = m_coarseFrameBuffer.getMutableCudaPtr();
    if (m_coarsePixelTable.getSize() != m_input.coarseFrameSize)
    {
        m_coarseIndexToPixel.resizeDiscard(coarseBytes);
        m_coarsePixelTable.setSize(m_input.coarseFrameSize);
        memcpy(m_coarseIndexToPixel.getMutablePtr(), m_coarsePixelTable.getIndexToPixel(), coarseBytes);
        m_coarseIndexToPixel.free(Buffer::CPU);
    }

    // With blur enabled the kernels render into a temporary frame instead of
    // `frame`; the real pointer is restored before the blur kernel runs.

    if (m_params.enableBlur)
    {
        m_tempFrameBuffer.resizeDiscard(pixelCount * sizeof(U32));
        m_input.frame = m_tempFrameBuffer.getMutableCudaPtr();
    }

    // Per-sample storage, needed only when more than one ray per pixel is cast.

    if (m_input.aaRays > 1)
    {
        m_aaSampleBuffer.resizeDiscard(pixelCount * m_input.aaRays * sizeof(U32));
        m_input.aaSampleBuffer = m_aaSampleBuffer.getMutableCudaPtr();
    }

    // Performance counters: 33 slots per (warp, counter) pair -- 32 per-thread
    // values followed by one per-warp value, matching the readback loop below.

    if (m_params.enablePerfCounters)
    {
        m_perfCounters.resizeDiscard(m_numWarps * PerfCounter_Max * 33 * sizeof(S64));
        memset(m_perfCounters.getMutablePtr(), 0, (size_t)m_perfCounters.getSize());
        m_input.perfCounters = m_perfCounters.getMutableCudaPtr();
    }

    // Optional coarse pass: one ray per coarse pixel, no antialiasing, batch
    // size 1 and contours compiled out. m_input is restored afterwards and
    // flagged so that the main pass uses the coarse data.

    LaunchResult coarseResult;
    if (m_params.enableBeamOptimization)
    {
        RenderInput savedInput = m_input;

        m_input.numPrimaryRays = coarsePixelCount;
        m_input.aaRays         = 1;
        m_input.flags         |= RenderFlags_CoarsePass;
        m_input.batchSize      = 1;
        m_compiler.undef("ENABLE_CONTOURS");

        coarseResult = launch(coarsePixelCount * m_params.numFrameRepeats, false);

        m_input        = savedInput;
        m_input.flags |= RenderFlags_UseCoarseData;
        if (contoursActive)
            m_compiler.define("ENABLE_CONTOURS");
    }

    // Main pass.

    m_input.numPrimaryRays = pixelCount * m_input.aaRays;
    LaunchResult renderResult = launch(m_input.numPrimaryRays * m_params.numFrameRepeats, true);

    // Optional blur post-process, writing into the caller's frame.

    F32 blurTime = 0.f;
    if (m_params.enableBlur)
    {
        // Point the kernel input back at the caller's frame buffer.
        m_input.frame = frame.getBuffer().getMutableCudaPtr();

        CudaModule* module = m_compiler.compile();

        // Upload the blur LUT: x and y as integers, z and the 2D distance
        // sqrt(x^2 + y^2) as raw float bit patterns.
        Vec4i* lutOut = (Vec4i*)module->getGlobal("c_blurLUT").getMutablePtr();
        for (int i = 0; i < m_blurLUT.getSize(); i++)
        {
            float dist = sqrtf((float)sqr(m_blurLUT[i].x) + (float)sqr(m_blurLUT[i].y));
            Vec4i& packed = lutOut[i];
            packed.x = m_blurLUT[i].x;
            packed.y = m_blurLUT[i].y;
            packed.z = floatToBits((float)m_blurLUT[i].z);
            packed.w = floatToBits(dist);
        }

        // Upload the kernel input and bind the source buffers as textures.
        RenderInput* moduleInput = (RenderInput*)module->getGlobal("c_input").getMutablePtr();
        *moduleInput = m_input;
        module->setTexRef("texTempFrameIn", m_tempFrameBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);
        module->setTexRef("texAASamplesIn", m_aaSampleBuffer, CU_AD_FORMAT_UNSIGNED_INT8, 4);

        blurTime = module->getKernel("blurKernel").launchTimed(frame.getSize(), Vec2i(8));
    }

    // Accumulate timing and warp statistics.

    F32 totalTime = renderResult.time + coarseResult.time + blurTime;
    m_results.launchTime  += totalTime;
    m_results.coarseTime  += coarseResult.time;
    m_results.renderWarps += renderResult.numWarps;
    m_results.coarseWarps += coarseResult.numWarps;

    // Fold the per-warp counter blocks into the running totals.

    if (m_params.enablePerfCounters)
    {
        const S64* cursor = (const S64*)m_perfCounters.getPtr();
        for (int warp = 0; warp < m_numWarps; warp++)
        {
            for (int counter = 0; counter < PerfCounter_Max; counter++)
            {
                for (int lane = 0; lane < 32; lane++)
                    m_results.threadCounters[counter] += *cursor++;
                m_results.warpCounters[counter] += *cursor++;
            }
        }
    }

    // Human-readable summary of this call.

    m_stats = sprintf("CudaRenderer: launch %.2f ms (%.2f FPS), %.2f MPix/s",
        totalTime * 1.0e3f,
        1.0f / totalTime,
        numPixelsToMPix(pixelCount) / totalTime);
    if (m_params.enableBlur)
        m_stats += sprintf(", blur %.2f MPix/s", numPixelsToMPix(pixelCount) / blurTime);

    // Keep m_numWarps at least twice the largest warp count observed, so the
    // next call has headroom. Reaching exactly m_numWarps is reported as an
    // overflow of the previous estimate.

    const int peakWarps = max(renderResult.numWarps, coarseResult.numWarps);
    const int nextWarps = peakWarps * 2;
    if (nextWarps > m_numWarps)
    {
        if (peakWarps == m_numWarps)
            printf("CudaRenderer: warp count auto-detect overflow, increasing warp count to %d\n", nextWarps);
        else
            printf("CudaRenderer: warp count auto-detected: %d warps, launching %d\n", peakWarps, nextWarps);
        m_numWarps = nextWarps;
    }
    return "";
}