void pcl::gpu::DeviceMemory2D::release()
{
    if( refcount_ && CV_XADD(refcount_, -1) == 1 )
    {
        //cv::fastFree(refcount);
        delete refcount_;
        cudaSafeCall( cudaFree(data_) );
    }

    colsBytes_ = 0;
    rows_ = 0;
    data_ = 0;
    step_ = 0;
    refcount_ = 0;
}
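// Hedged illustration (not from this file): release() above only frees when the
// atomic decrement returns 1, i.e. when the caller was the last owner. For that
// to hold, every shallow copy must bump the shared counter first. The pattern,
// reduced to a self-contained toy with hypothetical names:
struct SharedDeviceBuffer
{
    void* data_ = nullptr;
    int*  refcount_ = nullptr;

    explicit SharedDeviceBuffer(size_t bytes)
    {
        cudaSafeCall( cudaMalloc(&data_, bytes) );
        refcount_ = new int(1);
    }

    SharedDeviceBuffer(const SharedDeviceBuffer& other)
        : data_(other.data_), refcount_(other.refcount_)
    {
        if (refcount_)
            CV_XADD(refcount_, 1); // one more owner; must precede any release()
    }

    void release()
    {
        if (refcount_ && CV_XADD(refcount_, -1) == 1) // last owner frees
        {
            delete refcount_;
            cudaSafeCall( cudaFree(data_) );
        }
        data_ = nullptr;
        refcount_ = nullptr;
    }
};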
void CGLUtil::setCudaDeviceForGLInteroperation()
{
    cudaDeviceProp sProp;
    memset( &sProp, 0, sizeof( cudaDeviceProp ) );
    sProp.major = 1;
    sProp.minor = 0;
    int nDev;
    cudaSafeCall( cudaChooseDevice( &nDev, &sProp ) );
    // tell CUDA which device (nDev) we will be using for graphics interop.
    // from the programming guide: Interoperability with OpenGL requires that
    // the CUDA device be specified by cudaGLSetGLDevice() before any other
    // runtime calls.
    //cudaSafeCall( cudaGLSetGLDevice( nDev ) );
    return;
}//setCudaDeviceForGLInteroperation()
void CGLUtil::printShortCudaDeviceInfo(int nDeviceNO_)
{
    int nDeviceCount = getCudaEnabledDeviceCount();
    bool valid = (nDeviceNO_ >= 0) && (nDeviceNO_ < nDeviceCount);

    int beg = valid ? nDeviceNO_   : 0;
    int end = valid ? nDeviceNO_+1 : nDeviceCount;

    int driverVersion = 0, runtimeVersion = 0;
    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

    for(int dev = beg; dev < end; ++dev)
    {
        cudaDeviceProp prop;
        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

        const char *arch_str = prop.major < 2 ? " (pre-Fermi)" : "";
        printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
        printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, /*convertSMVer2Cores(prop.major, prop.minor) **/ prop.multiProcessorCount);
        printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
    }
    fflush(stdout);
}
void vm::scanner::cuda::DeviceMemory::create(size_t sizeBytes_arg)
{
    if (sizeBytes_arg == sizeBytes_)
        return;

    if( sizeBytes_arg > 0)
    {
        if( data_ )
            release();

        sizeBytes_ = sizeBytes_arg;

        cudaSafeCall( cudaMalloc(&data_, sizeBytes_) );

        //refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
void vm::scanner::cuda::DeviceMemory2D::create(int rows_arg, int colsBytes_arg)
{
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;

    if( rows_arg > 0 && colsBytes_arg > 0)
    {
        if( data_ )
            release();

        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;

        cudaSafeCall( cudaMallocPitch( (void**)&data_, &step_, colsBytes_, rows_) );

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
Mat6f get(Vec6f& b)
{
    cudaSafeCall( cudaStreamSynchronize(stream) );

    Mat6f A;
    float *data_A = A.val;
    float *data_b = b.val;

    int shift = 0;
    for (int i = 0; i < 6; ++i)     //rows
        for (int j = i; j < 7; ++j) // cols + b
        {
            float value = locked_buffer.data[shift++];
            if (j == 6) // vector b
                data_b[i] = value;
            else
                data_A[j * 6 + i] = data_A[i * 6 + j] = value;
        }
    return A;
}
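// Layout note for get() above (a hedged reading of the indexing): the reduction
// buffer packs, row by row, the upper triangle of the symmetric 6x6 matrix A
// followed by that row's entry of the right-hand side b, so row i contributes
// (7 - i) floats and the whole buffer holds 21 + 6 = 27 floats. A tiny
// host-side self-check of that arithmetic:
#include <cassert>
inline void checkPackedSystemLayout()
{
    int shift = 0;
    for (int i = 0; i < 6; ++i)     // rows
        for (int j = i; j < 7; ++j) // cols + b
            ++shift;
    assert(shift == 27); // 21 upper-triangular entries of A plus 6 entries of b
}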
void pcl::gpu::DeviceMemory::create(size_t sizeBytes_arg)
{
    if (sizeBytes_arg == sizeBytes_)
        return;

    if( sizeBytes_arg > 0)
    {
        if( data_ )
            release();

        sizeBytes_ = sizeBytes_arg;

        printf( "[CUDA] Allocating memory %zu bytes.\n", sizeBytes_ ); // %zu: sizeBytes_ is size_t

        cudaSafeCall( cudaMalloc(&data_, sizeBytes_) );

        //refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
void pcl::gpu::DeviceMemory2D::create(int rows_arg, int colsBytes_arg)
{
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;

    if( rows_arg > 0 && colsBytes_arg > 0)
    {
        if( data_ )
            release();

        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;

        printf( "[CUDA] Allocating memory %d x %d = %d bytes.\n", colsBytes_, rows_, colsBytes_ * rows_ );

        cudaSafeCall( cudaMallocPitch( (void**)&data_, &step_, colsBytes_, rows_) );

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
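// Usage note for the pitched allocation above (a hedged sketch; the kernel
// below is illustrative, not part of this codebase): cudaMallocPitch may pad
// each row for alignment, so step_ can exceed colsBytes_, and row y starts at
// data_ + y * step_, never at data_ + y * colsBytes_.
__global__ void touchPitchedRows(unsigned char* data, size_t step, int colsBytes, int rows)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < colsBytes && y < rows)
        data[y * step + x] = 0; // index with the pitch, not the logical row width
}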
// Write positions or velocities of the particles into the GPU memory
void ParticleSystem::setArray(ParticleArray array, const float* data, int start, int count)
{
    if(array == ArrayPositions)
    {
        cudaSafeCall(cudaGraphicsUnregisterResource(mCudaVboResourceParticlePositions));
        glBindBuffer(GL_ARRAY_BUFFER, mVboParticlePositions);
        glBufferSubData(GL_ARRAY_BUFFER, start*4*sizeof(float), count*4*sizeof(float), data);
        glBindBuffer(GL_ARRAY_BUFFER, 0);
        cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceParticlePositions, mVboParticlePositions, cudaGraphicsMapFlagsNone));
    }
    else if(array == ArrayVelocities)
    {
        cudaSafeCall(cudaMemcpy(
                         (char*) mDeviceParticleVel + start*4*sizeof(float), // destination
                         data,                                               // source
                         count*4*sizeof(float),                              // count
                         cudaMemcpyHostToDevice                              // copy-kind
                     ));
    }
}
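// Hedged alternative for the position path above: unregistering the VBO,
// updating it through GL, and re-registering it is a comparatively expensive
// round trip. The registered resource can instead be mapped once and written
// with a plain device copy. A sketch under that assumption (helper name and
// signature are hypothetical):
static void uploadToRegisteredVbo(cudaGraphicsResource* resource, const float* hostData, int start, int count)
{
    cudaSafeCall(cudaGraphicsMapResources(1, &resource, 0));

    float* devPtr = nullptr;
    size_t bytes = 0;
    cudaSafeCall(cudaGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&devPtr), &bytes, resource));

    // same float4-per-particle layout as setArray()
    cudaSafeCall(cudaMemcpy(devPtr + start*4, hostData, count*4*sizeof(float), cudaMemcpyHostToDevice));

    cudaSafeCall(cudaGraphicsUnmapResources(1, &resource, 0));
}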
GLuint CGLUtil::gpuMapRgb2PixelBufferObj(const cv::cuda::GpuMat& cvgmRGB_ )
{
    //http://rickarkin.blogspot.co.uk/2012/03/use-pbo-to-share-buffer-between-cuda.html
    int nPyrLevel_ = getLevel( cvgmRGB_.cols );
    GLuint uTexture;
    // map OpenGL buffer object for writing from CUDA
    if (cvgmRGB_.channels() == 3)
    {
        uTexture = _auTexture[nPyrLevel_];
        void *pDev;
        cudaSafeCall( cudaGraphicsMapResources(1, &_apResourceRGBPxielBO[nPyrLevel_], 0) );
        size_t nSize;
        cudaSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize, _apResourceRGBPxielBO[nPyrLevel_]) );
        cv::cuda::GpuMat cvgmRGBA( cvgmRGB_.size(), CV_8UC3, pDev );
        cvgmRGB_.copyTo(cvgmRGBA);
        cudaSafeCall( cudaGraphicsUnmapResources(1, &_apResourceRGBPxielBO[nPyrLevel_], 0) );
        //texture mapping
        glBindTexture(GL_TEXTURE_2D, uTexture);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, _auRGBPixelBO[nPyrLevel_]);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, cvgmRGB_.cols, cvgmRGB_.rows, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
        errorDetectorGL();
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
    }
    else if (cvgmRGB_.channels() == 1)
    {
        uTexture = _auGrayTexture[nPyrLevel_];
        void *pDev;
        cudaSafeCall( cudaGraphicsMapResources(1, &_apResourceGrayPxielBO[nPyrLevel_], 0) );
        size_t nSize;
        cudaSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize, _apResourceGrayPxielBO[nPyrLevel_]) );
        cv::cuda::GpuMat cvgmRGBA( cvgmRGB_.size(), CV_8UC1, pDev );
        cvgmRGB_.copyTo(cvgmRGBA);
        cudaSafeCall( cudaGraphicsUnmapResources(1, &_apResourceGrayPxielBO[nPyrLevel_], 0) );
        //texture mapping
        glBindTexture(GL_TEXTURE_2D, uTexture);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, _auGrayPixelBO[nPyrLevel_]);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RED, cvgmRGB_.cols, cvgmRGB_.rows, 0, GL_RED, GL_UNSIGNED_BYTE, NULL);
        errorDetectorGL();
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
    }
    return uTexture;
}//gpuMapRgb2PixelBufferObj
void vm::scanner::cuda::DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const
{
    cudaSafeCall( cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
inline bool TrackerInterface::process()
{
    if(firstRun)
    {
        cudaSafeCall(cudaSetDevice(ConfigArgs::get().gpu));
        firstRun = false;
    }

    if(!threadPack.pauseCapture.getValue())
    {
        TICK(threadIdentifier);

        uint64_t start = Stopwatch::getCurrentSystemTime();

        bool returnVal = true;
        bool shouldEnd = endRequested.getValue();

        if(!logRead->grabNext(returnVal, currentFrame) || shouldEnd)
        {
            threadPack.pauseCapture.assignValue(true);
            threadPack.finalised.assignValue(true);

            finalise();

            while(!threadPack.cloudSliceProcessorFinished.getValueWait())
            {
                frontend->cloudSignal.notify_all();
            }

            return shouldEnd ? false : returnVal;
        }

        depth.data = (unsigned short *)logRead->decompressedDepth;
        rgb24.data = (PixelRGB *)logRead->decompressedImage;

        currentFrame++;

        depth.step = Resolution::get().width() * 2;
        depth.rows = Resolution::get().rows();
        depth.cols = Resolution::get().cols();

        rgb24.step = Resolution::get().width() * 3;
        rgb24.rows = Resolution::get().rows();
        rgb24.cols = Resolution::get().cols();

        depth_device.upload(depth.data, depth.step, depth.rows, depth.cols);
        colors_device.upload(rgb24.data, rgb24.step, rgb24.rows, rgb24.cols);

        TICK("processFrame");
        frontend->processFrame(depth_device, colors_device,
                               logRead->decompressedImage, logRead->decompressedDepth,
                               logRead->timestamp, logRead->isCompressed,
                               logRead->compressedDepth, logRead->compressedDepthSize,
                               logRead->compressedImage, logRead->compressedImageSize);
        TOCK("processFrame");

        uint64_t duration = Stopwatch::getCurrentSystemTime() - start;

        if(threadPack.limit.getValue() && duration < 33333)
        {
            int sleepTime = std::max(int(33333 - duration), 0);
            usleep(sleepTime);
        }

        TOCK(threadIdentifier);
    }
    return true;
}
void pcl::gpu::printCudaDeviceInfo(int device)
{
    int count = getCudaEnabledDeviceCount();

    bool valid = (device >= 0) && (device < count);

    int beg = valid ? device   : 0;
    int end = valid ? device+1 : count;

    printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
    printf("Device count: %d\n", count);

    int driverVersion = 0, runtimeVersion = 0;
    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

    const char *computeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
        "Exclusive Process (many threads in one process are able to use ::cudaSetDevice() with this device)",
        "Unknown",
        NULL
    };

    for(int dev = beg; dev < end; ++dev)
    {
        cudaDeviceProp prop;
        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

        int sm_cores = convertSMVer2Cores(prop.major, prop.minor);

        printf("\nDevice %d: \"%s\"\n", dev, prop.name);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
        printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, sm_cores, sm_cores * prop.multiProcessorCount);
        printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);

#if (CUDART_VERSION >= 4000)
        // These are not available in the CUDA Runtime API, so we make the necessary calls to the driver API to support this output
        int memoryClock, memBusWidth, L2CacheSize;
        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
        getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );

        printf("  Memory Clock rate:                             %.2f Mhz\n", memoryClock * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        if (L2CacheSize)
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);

        printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
               prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
        printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
               prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
#endif
        printf("  Total amount of constant memory:               %u bytes\n", (unsigned)prop.totalConstMem);
        printf("  Total amount of shared memory per block:       %u bytes\n", (unsigned)prop.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
        printf("  Warp size:                                     %d\n", prop.warpSize);
        printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %u bytes\n", (unsigned)prop.memPitch);
        printf("  Texture alignment:                             %u bytes\n", (unsigned)prop.textureAlignment);

#if CUDART_VERSION >= 4000
        printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
#else
        printf("  Concurrent copy and execution:                 %s\n", prop.deviceOverlap ? "Yes" : "No");
#endif
        printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");
        printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
        printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
#if CUDART_VERSION >= 4000
        printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID );
#endif
        printf("  Compute Mode:\n");
        printf("      %s \n", computeMode[prop.computeMode]);
    }

    printf("\n");
    printf("deviceQuery, CUDA Driver = CUDART");
    printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100);
    printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100);
    printf(", NumDevs = %d\n\n", count);
    fflush(stdout);
}
void vm::scanner::cuda::DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg)
{
    create(rows_arg, colsBytes_arg);
    cudaSafeCall( cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg, colsBytes_, rows_, cudaMemcpyHostToDevice) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
void pcl::gpu::setDevice(int device) { cudaSafeCall( cudaSetDevice( device ) ); }
void vm::scanner::cuda::DeviceMemory::download(void *host_ptr_arg) const
{
    cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
void ParticleSystem::slotResetParticles()
{
    qDebug() << __PRETTY_FUNCTION__;

    if(!mIsInitialized)
        slotInitialize();

    // When re-setting particles, also reset their position of last collision!
    cudaSafeCall(cudaMemset(mDeviceParticleCollisionPositions, 0, mParametersSimulation->particleCount * 4 * sizeof(float)));

    switch(mDefaultParticlePlacement)
    {
    case ParticlePlacement::PlacementRandom:
    {
        int p = 0, v = 0;

        qDebug() << __PRETTY_FUNCTION__
                 << "world min" << mParametersSimulation->gridParticleSystem.worldMin.x << mParametersSimulation->gridParticleSystem.worldMin.y << mParametersSimulation->gridParticleSystem.worldMin.z
                 << "max" << mParametersSimulation->gridParticleSystem.worldMax.x << mParametersSimulation->gridParticleSystem.worldMax.y << mParametersSimulation->gridParticleSystem.worldMax.z;

        for(unsigned int i=0; i < mParametersSimulation->particleCount; i++)
        {
            mHostParticlePos[p++] = mParametersSimulation->gridParticleSystem.worldMin.x + (mParametersSimulation->gridParticleSystem.worldMax.x - mParametersSimulation->gridParticleSystem.worldMin.x) * frand();
            mHostParticlePos[p++] = mParametersSimulation->gridParticleSystem.worldMin.y + (mParametersSimulation->gridParticleSystem.worldMax.y - mParametersSimulation->gridParticleSystem.worldMin.y) * frand();
            mHostParticlePos[p++] = mParametersSimulation->gridParticleSystem.worldMin.z + (mParametersSimulation->gridParticleSystem.worldMax.z - mParametersSimulation->gridParticleSystem.worldMin.z) * frand();
            mHostParticlePos[p++] = 1.0f;

            mHostParticleVel[v++] = 0.0f;
            mHostParticleVel[v++] = 0.0f;
            mHostParticleVel[v++] = 0.0f;
            mHostParticleVel[v++] = 0.0f;
        }
        break;
    }

    case ParticlePlacement::PlacementGrid:
    {
        const float jitter = mParametersSimulation->particleRadius * 0.01f;
        const float spacing = mParametersSimulation->particleRadius * 2.0f;

        // If we want a cube, determine the number of particles in each dimension
        unsigned int s = (int) ceilf(powf((float) mParametersSimulation->particleCount, 1.0f / 3.0f));
        unsigned int gridSize[3];
        gridSize[0] = gridSize[1] = gridSize[2] = s;

        srand(1973);

        for(unsigned int z=0; z<gridSize[2]; z++)
        {
            for(unsigned int y=0; y<gridSize[1]; y++)
            {
                for(unsigned int x=0; x<gridSize[0]; x++)
                {
                    unsigned int i = (z*gridSize[1]*gridSize[0]) + (y*gridSize[0]) + x;

                    if (i < mParametersSimulation->particleCount)
                    {
                        mHostParticlePos[i*4+0] = (spacing * x) + mParametersSimulation->particleRadius - 1.0f + (frand()*2.0f-1.0f)*jitter;
                        mHostParticlePos[i*4+1] = (spacing * y) + mParametersSimulation->particleRadius - 1.0f + (frand()*2.0f-1.0f)*jitter;
                        mHostParticlePos[i*4+2] = (spacing * z) + mParametersSimulation->particleRadius - 1.0f + (frand()*2.0f-1.0f)*jitter;
                        mHostParticlePos[i*4+3] = 1.0f;

                        mHostParticleVel[i*4+0] = 0.0f;
                        mHostParticleVel[i*4+1] = 0.0f;
                        mHostParticleVel[i*4+2] = 0.0f;
                        mHostParticleVel[i*4+3] = 0.0f;
                    }
                }
            }
        }
        break;
    }

    case ParticlePlacement::PlacementFillSky:
    {
        float jitter = mParametersSimulation->particleRadius * 0.1f;
        const float spacing = mParametersSimulation->particleRadius * 2.02f;

        unsigned int particleNumber = 0;

        for( float y = mParametersSimulation->gridParticleSystem.worldMax.y - mParametersSimulation->particleRadius;
             y >= mParametersSimulation->gridParticleSystem.worldMin.y + mParametersSimulation->particleRadius && particleNumber < mParametersSimulation->particleCount;
             y -= spacing)
        {
            for( float x = mParametersSimulation->gridParticleSystem.worldMin.x + mParametersSimulation->particleRadius;
                 x <= mParametersSimulation->gridParticleSystem.worldMax.x - mParametersSimulation->particleRadius && particleNumber < mParametersSimulation->particleCount;
                 x += spacing)
            {
                for( float z = mParametersSimulation->gridParticleSystem.worldMin.z + mParametersSimulation->particleRadius;
                     z <= mParametersSimulation->gridParticleSystem.worldMax.z - mParametersSimulation->particleRadius && particleNumber < mParametersSimulation->particleCount;
                     z += spacing)
                {
                    // qDebug() << "moving particle" << particleNumber << "to" << x << y << z;
                    mHostParticlePos[particleNumber*4+0] = x + (frand()-0.5) * jitter;
                    mHostParticlePos[particleNumber*4+1] = y + (frand()-0.5) * jitter;
                    mHostParticlePos[particleNumber*4+2] = z + (frand()-0.5) * jitter;
                    mHostParticlePos[particleNumber*4+3] = 1.0f;

                    mHostParticleVel[particleNumber*4+0] = 0.0f;
                    mHostParticleVel[particleNumber*4+1] = 0.0f;
                    mHostParticleVel[particleNumber*4+2] = 0.0f;
                    mHostParticleVel[particleNumber*4+3] = 0.0f;

                    particleNumber++;
                }
            }
        }
        break;
    }
    }

    setArray(ArrayPositions, mHostParticlePos, 0, mParametersSimulation->particleCount);
    setArray(ArrayVelocities, mHostParticleVel, 0, mParametersSimulation->particleCount);
}
bool MainController::setup()
{
    pcl::console::setVerbosityLevel(pcl::console::L_ALWAYS);

    Volume::get(ConfigArgs::get().volumeSize);

    Stopwatch::get().setCustomSignature(43543534);

    cudaSafeCall(cudaSetDevice(ConfigArgs::get().gpu));

    loadCalibration();

    std::cout << "Point resolution: " << ((int)((Volume::get().getVoxelSizeMeters().x * 1000.0f) * 10.0f)) / 10.0f << " millimetres" << std::endl;

    if(ConfigArgs::get().logFile.size())
    {
        rawRead = new RawLogReader;
        logRead = static_cast<LogReader *>(rawRead);
    }
    else
    {
        liveRead = new LiveLogReader;
        logRead = static_cast<LogReader *>(liveRead);
    }

    ThreadDataPack::get();

    trackerInterface = new TrackerInterface(logRead, depthIntrinsics);

    if(ConfigArgs::get().trajectoryFile.size())
    {
        std::cout << "Load trajectory: " << ConfigArgs::get().trajectoryFile << std::endl;
        trackerInterface->loadTrajectory(ConfigArgs::get().trajectoryFile);
    }

    systemComponents.push_back(trackerInterface);

    ThreadDataPack::get().assignFrontend(trackerInterface->getFrontend());

    cloudSliceProcessor = new CloudSliceProcessor();
    systemComponents.push_back(cloudSliceProcessor);

    if(ConfigArgs::get().extractOverlap)
    {
        trackerInterface->enableOverlap();
    }

    if(!ConfigArgs::get().incrementalMesh && ConfigArgs::get().enableMeshGenerator)
    {
        meshGenerator = new MeshGenerator();
        systemComponents.push_back(meshGenerator);
    }
    else
    {
        ThreadDataPack::get().meshGeneratorFinished.assignValue(true);
    }

    if(ConfigArgs::get().vocabFile.size() && ConfigArgs::get().onlineDeformation)
    {
        deformation = new Deformation;
        placeRecognition = new PlaceRecognition(depthIntrinsics);
        systemComponents.push_back(deformation);
        systemComponents.push_back(placeRecognition);
    }
    else
    {
        ThreadDataPack::get().deformationFinished.assignValue(true);
        ThreadDataPack::get().placeRecognitionFinished.assignValue(true);
    }

    pangoVis = new PangoVis(depthIntrinsics);

    return true;
}
void ParticleSystem::freeResources()
{
    Q_ASSERT(mIsInitialized);

    qDebug() << __PRETTY_FUNCTION__ << "freeing allocated memory...";

    delete [] mHostParticlePos;
    delete [] mHostParticleVel;

    cudaSafeCall(cudaFree(mDeviceColliderSortedPos));
    cudaSafeCall(cudaFree(mDeviceParticleVel));
    cudaSafeCall(cudaFree(mDeviceParticleSortedPos));
    cudaSafeCall(cudaFree(mDeviceParticleSortedVel));

    cudaSafeCall(cudaFree(mDeviceParticleMapGridCell));
    cudaSafeCall(cudaFree(mDeviceParticleMapIndex));
    cudaSafeCall(cudaFree(mDeviceParticleCellStart));
    cudaSafeCall(cudaFree(mDeviceParticleCellEnd));

    cudaSafeCall(cudaFree(mDeviceColliderMapGridCell));
    cudaSafeCall(cudaFree(mDeviceColliderMapIndex));
    cudaSafeCall(cudaFree(mDeviceColliderCellStart));
    cudaSafeCall(cudaFree(mDeviceColliderCellEnd));

    cudaSafeCall(cudaFree(mDeviceParticleCollisionPositions));

    cudaSafeCall(cudaGraphicsUnregisterResource(mCudaVboResourceParticlePositions));

    OpenGlUtilities::deleteVbo(mVboParticlePositions);

    setNullPointers();

    // mVboParticlePositions should now be zero, emit it and glScene won't render it
    slotEmitVboInfoAndParameters();

    mIsInitialized = false;

    qDebug() << __PRETTY_FUNCTION__ << "done.";
}
void ParticleSystem::slotInitialize()
{
    qDebug() << __PRETTY_FUNCTION__;

    if(mIsInitialized)
    {
        qDebug() << __PRETTY_FUNCTION__ << "already initialized, returning.";
        return;
    }

    OpenGlUtilities::checkError();

    // This needs to be called only once, but here it might get called more often. Shouldn't be a problem.
    initializeOpenGLFunctions();

    size_t memTotal, memFree;
    cudaSafeCall(cudaMemGetInfo(&memFree, &memTotal));
    qDebug() << __PRETTY_FUNCTION__ << "before init, device has" << memFree / 1048576 << "of" << memTotal / 1048576 << "mb free.";

    mUpdateMappingFromColliderToGridCell = true;

    // Change particleCount so that it makes sense to use with particleRadius: we want the top third to be populated with particles.
    float volumeTotal = mParametersSimulation->gridParticleSystem.getWorldVolume();

    // Abstract particles to boxes, as that's how they'll be arranged
    float volumeParticle = pow(mParametersSimulation->particleRadius*2, 3);

    // Use as many particles as needed, but never more than, say, 32k. 64k might also work, but we don't need the density for a 32m^3 scanVolume.
    mParametersSimulation->particleCount = (volumeTotal / volumeParticle) / 3;
    if(mParametersSimulation->particleCount > 32768)
        mParametersSimulation->particleCount = 32768;

    // Set gridsize so that the cell edges are never shorter than the particle's diameter! If particles were allowed to be larger than gridcells in
    // any dimension, we couldn't find all relevant neighbors by searching through only the (3*3*3)-1 = 26 cells immediately neighboring this one.
    // We could also modify this compromise by allowing larger particles and then searching the (5*5*5)-1 = 124 surrounding cells. Haven't tried.
    // Using fewer (larger) cells is possible; it means less memory being used for the grid, but more searching when looking for neighbors,
    // because more particles now occupy a single grid cell. This might not hurt performance as long as we do not cross an unknown threshold (kernel swap size)?!
    const QVector3D particleSystemWorldSize = CudaHelper::convert(mParametersSimulation->gridParticleSystem.getWorldSize());

    mParametersSimulation->gridParticleSystem.cells.x = nextHigherPowerOfTwo(particleSystemWorldSize.x() / mParametersSimulation->particleRadius) / 2;
    mParametersSimulation->gridParticleSystem.cells.x = qBound(2, (int)mParametersSimulation->gridParticleSystem.cells.x, 128);
    mParametersSimulation->gridParticleSystem.cells.y = nextHigherPowerOfTwo(particleSystemWorldSize.y() / mParametersSimulation->particleRadius) / 2;
    mParametersSimulation->gridParticleSystem.cells.y = qBound(2, (int)mParametersSimulation->gridParticleSystem.cells.y, 128);
    mParametersSimulation->gridParticleSystem.cells.z = nextHigherPowerOfTwo(particleSystemWorldSize.z() / mParametersSimulation->particleRadius) / 2;
    mParametersSimulation->gridParticleSystem.cells.z = qBound(2, (int)mParametersSimulation->gridParticleSystem.cells.z, 128);

    const quint32 numberOfCells = mParametersSimulation->gridParticleSystem.getCellCount();

    // allocate host storage for particle positions and velocities, then set them to zero
    mHostParticlePos = new float[mParametersSimulation->particleCount * 4];
    mHostParticleVel = new float[mParametersSimulation->particleCount * 4];
    mNumberOfBytesAllocatedCpu += mParametersSimulation->particleCount * 4 * sizeof(float) * 2; // two float4 arrays
    memset(mHostParticlePos, 0, mParametersSimulation->particleCount * 4 * sizeof(float));
    memset(mHostParticleVel, 0, mParametersSimulation->particleCount * 4 * sizeof(float));

    // determine GPU data-size
    const quint32 memSizeParticleQuadrupels = sizeof(float) * 4 * mParametersSimulation->particleCount;

    // Allocate GPU data
    // Create VBO with particle positions. This is later given to particle renderer for visualization
    mVboParticlePositions = OpenGlUtilities::createVbo(memSizeParticleQuadrupels);
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceParticlePositions, mVboParticlePositions, cudaGraphicsMapFlagsNone));

    // Create VBO with collider positions. This is later given to particle renderer for visualization
    mVboColliderPositions = mPointCloudColliders->getRenderInfo()->at(0)->vbo; //OpenGlUtilities::createVbo(sizeof(float) * 4 * mSimulationParameters->colliderCountMax);
    //qDebug() << "vbo colliderpos is" << mVboColliderPositions;
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceColliderPositions, mVboColliderPositions, cudaGraphicsMapFlagsNone));

    // use vboInfo.size or mSimulationParameters->colliderCountMax?
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleVel, memSizeParticleQuadrupels));
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;

    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderSortedPos, sizeof(float) * 4 * mPointCloudColliders->getCapacity()));
    mNumberOfBytesAllocatedGpu += sizeof(float) * 4 * mPointCloudColliders->getCapacity();

    // Here, we store the positions of each particle's last collision with the colliders
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleCollisionPositions, sizeof(float) * 4 * mParametersSimulation->particleCount));
    cudaSafeCall(cudaMemset(mDeviceParticleCollisionPositions, 0, sizeof(float) * 4 * mParametersSimulation->particleCount)); // set to (float)-zero
    mNumberOfBytesAllocatedGpu += sizeof(float) * 4 * mParametersSimulation->particleCount;

    // Sorted according to containing grid cell.
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleSortedPos, memSizeParticleQuadrupels));
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleSortedVel, memSizeParticleQuadrupels));
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;

    // These two are used to map from gridcell (=hash) to particle id (=index). If we also know in which
    // indices of these arrays grid cells start and end, we can quickly find particles in neighboring cells...
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleMapGridCell, mParametersSimulation->particleCount*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleMapIndex, mParametersSimulation->particleCount*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += mParametersSimulation->particleCount * sizeof(uint) * 2;

    // Same thing as above, just for colliders
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderMapGridCell, mPointCloudColliders->getCapacity()*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderMapIndex, mPointCloudColliders->getCapacity()*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += mPointCloudColliders->getCapacity() * sizeof(uint) * 2;

    // ...and that's what we do here: in mDeviceCellStart[17], you'll find
    // where in mDeviceGridParticleHash cell 17 starts!
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleCellStart, numberOfCells*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleCellEnd, numberOfCells*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += numberOfCells * sizeof(uint) * 2;

    // Same for colliders...
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderCellStart, numberOfCells*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderCellEnd, numberOfCells*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += numberOfCells * sizeof(uint) * 2;

    qDebug() << __PRETTY_FUNCTION__ << "worldsize" << CudaHelper::convert(mParametersSimulation->gridParticleSystem.getWorldSize())
             << "and particle radius" << mParametersSimulation->particleRadius << ": created system with"
             << mParametersSimulation->particleCount << "particles and"
             << mParametersSimulation->gridParticleSystem.cells.x << "*" << mParametersSimulation->gridParticleSystem.cells.y << "*" << mParametersSimulation->gridParticleSystem.cells.z << "cells";

    qDebug() << __PRETTY_FUNCTION__ << "allocated" << mNumberOfBytesAllocatedCpu << "bytes on CPU," << mNumberOfBytesAllocatedGpu << "bytes on GPU.";

    copyParametersToGpu(mParametersSimulation);

    mIsInitialized = true;

    slotResetParticles();
    slotEmitVboInfoAndParameters();

    cudaSafeCall(cudaMemGetInfo(&memFree, &memTotal));
    qDebug() << __PRETTY_FUNCTION__ << "after init, device has" << memFree / 1048576 << "of" << memTotal / 1048576 << "mb free.";
}
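// Hedged sketch of how the cell-start/end arrays allocated above are typically
// filled once the particle hashes are sorted (the pattern from NVIDIA's
// "particles" sample; this kernel is illustrative, not part of this file):
// each sorted entry compares its cell id with its left/right neighbour to
// detect cell boundaries.
__global__ void findCellBoundaries(const unsigned int* sortedGridCell, // cell id per sorted particle
                                   unsigned int* cellStart,
                                   unsigned int* cellEnd,
                                   unsigned int numParticles)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numParticles)
        return;

    const unsigned int cell = sortedGridCell[i];

    if (i == 0 || cell != sortedGridCell[i - 1])
        cellStart[cell] = i;        // first sorted particle in this cell
    if (i == numParticles - 1 || cell != sortedGridCell[i + 1])
        cellEnd[cell] = i + 1;      // one past the last sorted particle
}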
void kf::cuda::waitAllDefaultStream() { cudaSafeCall(cudaDeviceSynchronize() ); }
template<class T>
inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
    cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
void vm::scanner::cuda::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
    create(sizeBytes_arg);
    cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
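// Hedged usage example for the upload()/download() pair above (sizes and
// contents are illustrative; assumes DeviceMemory is default-constructible,
// as the create()/release() snippets here suggest):
#include <vector>
inline void deviceMemoryRoundTrip()
{
    std::vector<float> host(256, 1.0f), back(256, 0.0f);

    vm::scanner::cuda::DeviceMemory mem;
    mem.upload(host.data(), host.size() * sizeof(float)); // create() + host-to-device copy
    mem.download(back.data());                            // device-to-host copy + sync

    // back now holds the same 256 values as host
}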
TextureBinder(const A& arr, const struct texture<T, 2, readMode>& tex, const cudaChannelFormatDesc& desc)
    : texref(&tex)
{
    cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
}
~TextureBinder() { cudaSafeCall( cudaUnbindTexture(texref) ); }
TextureBinder(const PtrSz<T>& arr, const struct texture<T, 1, readMode>& tex)
    : texref(&tex)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
    cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) );
}
TextureBinder(const PtrStepSz<T>& arr, const struct texture<T, 2, readMode>& tex)
    : texref(&tex)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
    cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
}
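// Hedged usage sketch for the RAII binders above (texture reference and
// function names are hypothetical): the texture stays bound for the binder's
// scope and the destructor unbinds it, even on early return.
//
// texture<float, 2, cudaReadModeElementType> depthTex; // file scope, as legacy texture references require
//
// void launchWithBoundTexture(const PtrStepSz<float>& depth)
// {
//     TextureBinder binder(depth, depthTex); // binds depthTex to depth's memory
//     // ... launch kernels that sample depthTex via tex2D(depthTex, x, y) ...
// }                                          // ~TextureBinder() unbinds here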
void vm::scanner::cuda::waitAllDefaultStream() { cudaSafeCall(cudaDeviceSynchronize() ); }
LabelImage RandomForestImage::predict(const RGBDImage& image,
        cuv::ndarray<float, cuv::host_memory_space>* probabilities,
        const bool onGPU, bool useDepthImages) const
{
    LabelImage prediction(image.getWidth(), image.getHeight());

    const LabelType numClasses = getNumClasses();

    if (treeData.size() != ensemble.size())
    {
        throw std::runtime_error((boost::format("tree data size: %d, ensemble size: %d. histograms normalized?")
                                  % treeData.size() % ensemble.size()).str());
    }

    cuv::ndarray<float, cuv::host_memory_space> hostProbabilities(
        cuv::extents[numClasses][image.getHeight()][image.getWidth()],
        m_predictionAllocator);

    if (onGPU)
    {
        cuv::ndarray<float, cuv::dev_memory_space> deviceProbabilities(
            cuv::extents[numClasses][image.getHeight()][image.getWidth()],
            m_predictionAllocator);

        cudaSafeCall(cudaMemset(deviceProbabilities.ptr(), 0,
                                static_cast<size_t>(deviceProbabilities.size() * sizeof(float))));

        {
            utils::Profile profile("classifyImagesGPU");
            for (const boost::shared_ptr<const TreeNodes>& data : treeData)
            {
                classifyImage(treeData.size(), deviceProbabilities, image, numClasses, data, useDepthImages);
            }
        }

        normalizeProbabilities(deviceProbabilities);

        cuv::ndarray<LabelType, cuv::dev_memory_space> output(image.getHeight(), image.getWidth(), m_predictionAllocator);
        determineMaxProbabilities(deviceProbabilities, output);

        hostProbabilities = deviceProbabilities;
        cuv::ndarray<LabelType, cuv::host_memory_space> outputHost(image.getHeight(), image.getWidth(), m_predictionAllocator);
        outputHost = output;

        {
            utils::Profile profile("setLabels");
            for (int y = 0; y < image.getHeight(); ++y)
            {
                for (int x = 0; x < image.getWidth(); ++x)
                {
                    prediction.setLabel(x, y, static_cast<LabelType>(outputHost(y, x)));
                }
            }
        }
    }
    else
    {
        utils::Profile profile("classifyImagesCPU");

        tbb::parallel_for(tbb::blocked_range<size_t>(0, image.getHeight()),
                          [&](const tbb::blocked_range<size_t>& range)
        {
            for(size_t y = range.begin(); y != range.end(); y++)
            {
                for(int x = 0; x < image.getWidth(); x++)
                {
                    for (LabelType label = 0; label < numClasses; label++)
                    {
                        hostProbabilities(label, y, x) = 0.0f;
                    }

                    for (const auto& tree : ensemble)
                    {
                        const auto& t = tree->getTree();
                        PixelInstance pixel(&image, 0, x, y);
                        const auto& hist = t->classifySoft(pixel);
                        assert(hist.size() == numClasses);
                        for(LabelType label = 0; label < hist.size(); label++)
                        {
                            hostProbabilities(label, y, x) += hist[label];
                        }
                    }

                    double sum = 0.0;
                    for (LabelType label = 0; label < numClasses; label++)
                    {
                        sum += hostProbabilities(label, y, x);
                    }

                    float bestProb = -1.0f;
                    for (LabelType label = 0; label < numClasses; label++)
                    {
                        hostProbabilities(label, y, x) /= sum;
                        float prob = hostProbabilities(label, y, x);
                        if (prob > bestProb)
                        {
                            prediction.setLabel(x, y, label);
                            bestProb = prob;
                        }
                    }
                }
            }
        });
    }

    if (probabilities)
    {
        *probabilities = hostProbabilities;
    }

    return prediction;
}
LabelImage RandomForestImage::improveHistograms(const RGBDImage& image, const LabelImage& labelImage,
        const bool onGPU, bool useDepthImages) const
{
    LabelImage prediction(image.getWidth(), image.getHeight());

    const LabelType numClasses = getNumClasses();

    if (treeData.size() != ensemble.size())
    {
        throw std::runtime_error((boost::format("tree data size: %d, ensemble size: %d. histograms normalized?")
                                  % treeData.size() % ensemble.size()).str());
    }

    cuv::ndarray<float, cuv::host_memory_space> hostProbabilities(
        cuv::extents[numClasses][image.getHeight()][image.getWidth()],
        m_predictionAllocator);

    // These offsets should have been used instead of traversing to the leaf again
    /*
    cuv::ndarray<unsigned int, cuv::dev_memory_space> nodeOffsets(
        cuv::extents[image.getHeight()][image.getWidth()],
        m_predictionAllocator);
    */

    if (onGPU)
    {
        cuv::ndarray<float, cuv::dev_memory_space> deviceProbabilities(
            cuv::extents[numClasses][image.getHeight()][image.getWidth()],
            m_predictionAllocator);

        cudaSafeCall(cudaMemset(deviceProbabilities.ptr(), 0,
                                static_cast<size_t>(deviceProbabilities.size() * sizeof(float))));

        {
            utils::Profile profile("classifyImagesGPU");
            for (const boost::shared_ptr<const TreeNodes>& data : treeData)
            {
                classifyImage(treeData.size(), deviceProbabilities, image, numClasses, data, useDepthImages);

                bool found_tree = false;
                // should be changed to a parallel for with a lock
                for (size_t treeNr = 0; treeNr < ensemble.size(); treeNr++)
                {
                    if (data->getTreeId() == ensemble[treeNr]->getId())
                    {
                        found_tree = true;
                        const boost::shared_ptr<RandomTree<PixelInstance, ImageFeatureFunction> >& tree = ensemble[treeNr]->getTree();

                        // this should have been used and done before trying to classify the images, since it doesn't change
                        //std::vector<size_t> leafSet;
                        //tree->collectLeafNodes(leafSet);

                        for (int y = 0; y < image.getHeight(); y++)
                            for (int x = 0; x < image.getWidth(); x++)
                            {
                                LabelType label = labelImage.getLabel(x, y);
                                if (!shouldIgnoreLabel(label))
                                {
                                    PixelInstance pixel(&image, label, x, y);
                                    // This should be changed: when classifying the image, the node offsets should be
                                    // returned and used directly instead of traversing to the leaves again. As a test,
                                    // one can check whether the node offset equals the one returned by the
                                    // traverse-to-leaf call.
                                    tree->setAllPixelsHistogram(pixel);
                                }
                            }
                    }
                    if (found_tree)
                        break;
                }
            }
        }
    }

    // should also add the CPU code!
    return prediction;
}