예제 #1
0
void pcl::gpu::DeviceMemory2D::release()
{
    if( refcount_ && CV_XADD(refcount_, -1) == 1 )
    {
        //cv::fastFree(refcount);
        delete refcount_;
        cudaSafeCall( cudaFree(data_) );
    }

    colsBytes_ = 0;
    rows_ = 0;
    data_ = 0;
    step_ = 0;
    refcount_ = 0;
}
	void CGLUtil::setCudaDeviceForGLInteroperation() {
		cudaDeviceProp  sProp;
		memset( &sProp, 0, sizeof( cudaDeviceProp ) );
		sProp.major = 1;
		sProp.minor = 0;
		int nDev;
		cudaSafeCall( cudaChooseDevice( &nDev, &sProp ) );
		// tell CUDA which nDev we will be using for graphic interop
		// from the programming guide:  Interoperability with OpenGL
		//     requires that the CUDA nDeviceNO_ be specified by
		//     cudaGLSetGLDevice() before any other runtime calls.
		//cudaSafeCall( cudaGLSetGLDevice( nDev ) ;

		return;
	}//setCudaDeviceForGLInteroperation()
	void CGLUtil::printShortCudaDeviceInfo(int nDeviceNO_) 
	{
		int nDeviceCount = getCudaEnabledDeviceCount();
		bool valid = (nDeviceNO_ >= 0) && (nDeviceNO_ < nDeviceCount);

		int beg = valid ? nDeviceNO_   : 0;
		int end = valid ? nDeviceNO_+1 : nDeviceCount;

		int driverVersion = 0, runtimeVersion = 0;
		cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
		cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

		for(int dev = beg; dev < end; ++dev)
		{                
			cudaDeviceProp prop;
			cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

			const char *arch_str = prop.major < 2 ? " (pre-Fermi)" : "";
			printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);                
			printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, /*convertSMVer2Cores(prop.major, prop.minor) **/ prop.multiProcessorCount);                
			printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
		}
		fflush(stdout);
	}
예제 #4
0
void vm::scanner::cuda::DeviceMemory::create(size_t sizeBytes_arg)
{
    if (sizeBytes_arg == sizeBytes_)
        return;
            
    if( sizeBytes_arg > 0)
    {        
        if( data_ )
            release();

        sizeBytes_ = sizeBytes_arg;
                        
        cudaSafeCall( cudaMalloc(&data_, sizeBytes_) );
        
        //refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
예제 #5
0
void vm::scanner::cuda::DeviceMemory2D::create(int rows_arg, int colsBytes_arg)
{
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;
            
    if( rows_arg > 0 && colsBytes_arg > 0)
    {        
        if( data_ )
            release();
              
        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;
                        
        cudaSafeCall( cudaMallocPitch( (void**)&data_, &step_, colsBytes_, rows_) );        

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
예제 #6
0
  Mat6f get(Vec6f& b)
  {
    cudaSafeCall( cudaStreamSynchronize(stream) );

    Mat6f A;
    float *data_A = A.val;
    float *data_b = b.val;

    int shift = 0;
    for (int i = 0; i < 6; ++i)   //rows
      for (int j = i; j < 7; ++j) // cols + b
      {
        float value = locked_buffer.data[shift++];
        if (j == 6)               // vector b
          data_b[i] = value;
        else
          data_A[j * 6 + i] = data_A[i * 6 + j] = value;
      }
    return A;
  }
예제 #7
0
void pcl::gpu::DeviceMemory::create(size_t sizeBytes_arg)
{
    if (sizeBytes_arg == sizeBytes_)
        return;

    if( sizeBytes_arg > 0)
    {
        if( data_ )
            release();

        sizeBytes_ = sizeBytes_arg;
        printf( "[CUDA] Allocating memory %d bytes.\n", sizeBytes_ );

        cudaSafeCall( cudaMalloc(&data_, sizeBytes_) );

        //refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
예제 #8
0
void pcl::gpu::DeviceMemory2D::create(int rows_arg, int colsBytes_arg)
{
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;

    if( rows_arg > 0 && colsBytes_arg > 0)
    {
        if( data_ )
            release();

        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;
        printf( "[CUDA] Allocating memory %d x %d = %d bytes.\n", colsBytes_, rows_, colsBytes_ * rows_ );

        cudaSafeCall( cudaMallocPitch( (void**)&data_, &step_, colsBytes_, rows_) );

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
예제 #9
0
// Write positions or velocities of the particles into the GPU memory
void ParticleSystem::setArray(ParticleArray array, const float* data, int start, int count)
{
    if(array == ArrayPositions)
    {
        cudaGraphicsUnregisterResource(mCudaVboResourceParticlePositions);

        glBindBuffer(GL_ARRAY_BUFFER, mVboParticlePositions);
        glBufferSubData(GL_ARRAY_BUFFER, start*4*sizeof(float), count*4*sizeof(float), data);
        glBindBuffer(GL_ARRAY_BUFFER, 0);

        cudaGraphicsGLRegisterBuffer(&mCudaVboResourceParticlePositions, mVboParticlePositions, cudaGraphicsMapFlagsNone);
    }
    else if(array == ArrayVelocities)
    {
        cudaSafeCall(cudaMemcpy(
                    (char*) mDeviceParticleVel + start*4*sizeof(float), // destination
                    data,                                       // source
                    count*4*sizeof(float),                      // count
                    cudaMemcpyHostToDevice                      // copy-kind
                    ));
    }
}
	GLuint CGLUtil::gpuMapRgb2PixelBufferObj(const cv::cuda::GpuMat& cvgmRGB_ ){
		//http://rickarkin.blogspot.co.uk/2012/03/use-pbo-to-share-buffer-between-cuda.html
		int nPyrLevel_ = getLevel( cvgmRGB_.cols );
		GLuint uTexture;
		// map OpenGL buffer object for writing from CUDA
		if (cvgmRGB_.channels() == 3) {
			uTexture = _auTexture[nPyrLevel_];
			void *pDev;
			cudaSafeCall( cudaGraphicsMapResources(1, &_apResourceRGBPxielBO[nPyrLevel_], 0)); 
			size_t nSize; 
			cudaSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize , _apResourceRGBPxielBO[nPyrLevel_]));
			cv::cuda::GpuMat cvgmRGBA( cvgmRGB_.size(), CV_8UC3, pDev); 
			cvgmRGB_.copyTo(cvgmRGBA); 
			cudaSafeCall( cudaGraphicsUnmapResources(1, &_apResourceRGBPxielBO[nPyrLevel_], 0) );
			//texture mapping
			glBindTexture( GL_TEXTURE_2D, uTexture);
			glBindBuffer ( GL_PIXEL_UNPACK_BUFFER_ARB, _auRGBPixelBO[nPyrLevel_]);
			glTexImage2D( GL_TEXTURE_2D, 0, GL_RGB, cvgmRGB_.cols, cvgmRGB_.rows, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
			errorDetectorGL();
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
			glBindTexture(GL_TEXTURE_2D, 0);
		}
		else if (cvgmRGB_.channels()==1) {
			uTexture = _auGrayTexture[nPyrLevel_];
			void *pDev;
			cudaSafeCall( cudaGraphicsMapResources(1, &_apResourceGrayPxielBO[nPyrLevel_], 0)); 
			size_t nSize; 
			cudaSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize , _apResourceGrayPxielBO[nPyrLevel_]));
			cv::cuda::GpuMat cvgmRGBA( cvgmRGB_.size(), CV_8UC1, pDev);
			cvgmRGB_.copyTo(cvgmRGBA); 
			cudaSafeCall( cudaGraphicsUnmapResources(1, &_apResourceGrayPxielBO[nPyrLevel_], 0) );
			//texture mapping
			glBindTexture(GL_TEXTURE_2D, uTexture);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, _auGrayPixelBO[nPyrLevel_]);
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RED, cvgmRGB_.cols, cvgmRGB_.rows, 0, GL_RED, GL_UNSIGNED_BYTE, NULL);
			errorDetectorGL();
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
			glBindTexture(GL_TEXTURE_2D, 0);
		}
		return uTexture;
	}//gpuMapRgb2PixelBufferObj
예제 #11
0
void vm::scanner::cuda::DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const
{    
    cudaSafeCall( cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}      
예제 #12
0
bool inline TrackerInterface::process()
{
    if(firstRun)
    {
        cudaSafeCall(cudaSetDevice(ConfigArgs::get().gpu));
        firstRun = false;
    }

    if(!threadPack.pauseCapture.getValue())
    {
        TICK(threadIdentifier);

        uint64_t start = Stopwatch::getCurrentSystemTime();

        bool returnVal = true;

        bool shouldEnd = endRequested.getValue();

        if(!logRead->grabNext(returnVal, currentFrame) || shouldEnd)
        {
            threadPack.pauseCapture.assignValue(true);
            threadPack.finalised.assignValue(true);

            finalise();

            while(!threadPack.cloudSliceProcessorFinished.getValueWait())
            {
                frontend->cloudSignal.notify_all();
            }

            return shouldEnd ? false : returnVal;
        }

        depth.data = (unsigned short *)logRead->decompressedDepth;
        rgb24.data = (PixelRGB *)logRead->decompressedImage;
        
        currentFrame++;

        depth.step = Resolution::get().width() * 2;
        depth.rows = Resolution::get().rows();
        depth.cols = Resolution::get().cols();

        rgb24.step = Resolution::get().width() * 3;
        rgb24.rows = Resolution::get().rows();
        rgb24.cols = Resolution::get().cols();

        depth_device.upload(depth.data, depth.step, depth.rows, depth.cols);
        colors_device.upload(rgb24.data, rgb24.step, rgb24.rows, rgb24.cols);

        TICK("processFrame");
        frontend->processFrame(depth_device,
                               colors_device,
                               logRead->decompressedImage,
                               logRead->decompressedDepth,
                               logRead->timestamp,
                               logRead->isCompressed,
                               logRead->compressedDepth,
                               logRead->compressedDepthSize,
                               logRead->compressedImage,
                               logRead->compressedImageSize);
        TOCK("processFrame");

        uint64_t duration = Stopwatch::getCurrentSystemTime() - start;

        if(threadPack.limit.getValue() && duration < 33333)
        {
            int sleepTime = std::max(int(33333 - duration), 0);
            usleep(sleepTime);
        }

        TOCK(threadIdentifier);
    }
    
    return true;
}
예제 #13
0
void pcl::gpu::printCudaDeviceInfo(int device)
{
    int count = getCudaEnabledDeviceCount();
    bool valid = (device >= 0) && (device < count);

    int beg = valid ? device   : 0;
    int end = valid ? device+1 : count;

    printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
    printf("Device count: %d\n", count);

    int driverVersion = 0, runtimeVersion = 0;
    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

    const char *computeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
        "Unknown",
        NULL
    };

    for(int dev = beg; dev < end; ++dev)
    {                
        cudaDeviceProp prop;
        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

        int sm_cores = convertSMVer2Cores(prop.major, prop.minor);

        printf("\nDevice %d: \"%s\"\n", dev, prop.name);        
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);        
        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);            
        printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, sm_cores, sm_cores * prop.multiProcessorCount);
        printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);

#if (CUDART_VERSION >= 4000)
        // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
        int memoryClock, memBusWidth, L2CacheSize;
        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );        
        getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );                
        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );

        printf("  Memory Clock rate:                             %.2f Mhz\n", memoryClock * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        if (L2CacheSize)
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        
        printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
            prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
            prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
        printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
            prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
            prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
#endif
        printf("  Total amount of constant memory:               %u bytes\n", (int)prop.totalConstMem);
        printf("  Total amount of shared memory per block:       %u bytes\n", (int)prop.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
        printf("  Warp size:                                     %d\n", prop.warpSize);
        printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1],  prop.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %u bytes\n", (int)prop.memPitch);
        printf("  Texture alignment:                             %u bytes\n", (int)prop.textureAlignment);

#if CUDART_VERSION >= 4000
        printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
#else
        printf("  Concurrent copy and execution:                 %s\n", prop.deviceOverlap ? "Yes" : "No");
#endif
        printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");

        printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
        printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
#if CUDART_VERSION >= 4000
        printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID );
#endif
        printf("  Compute Mode:\n");
        printf("      %s \n", computeMode[prop.computeMode]);
    }
    
    printf("\n");    
    printf("deviceQuery, CUDA Driver = CUDART");
    printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
    printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100);
    printf(", NumDevs = %d\n\n", count);                
    fflush(stdout);
}
예제 #14
0
void vm::scanner::cuda::DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg)
{
    create(rows_arg, colsBytes_arg);
    cudaSafeCall( cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg, colsBytes_, rows_, cudaMemcpyHostToDevice) );        
    cudaSafeCall( cudaDeviceSynchronize() );
}
예제 #15
0
void pcl::gpu::setDevice(int device)
{
    cudaSafeCall( cudaSetDevice( device ) );
}
예제 #16
0
void vm::scanner::cuda::DeviceMemory::download(void *host_ptr_arg) const
{    
    cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}          
예제 #17
0
void ParticleSystem::slotResetParticles()
{
    qDebug() << __PRETTY_FUNCTION__;

    if(!mIsInitialized) slotInitialize();

    // When re-setting particles, also reset their position of last collision!
    cudaSafeCall(cudaMemset(mDeviceParticleCollisionPositions, 0, mParametersSimulation->particleCount * 4 * sizeof(float)));

    switch(mDefaultParticlePlacement)
    {
    case ParticlePlacement::PlacementRandom:
    {
        int p = 0, v = 0;

        qDebug() << __PRETTY_FUNCTION__ << "world min" << mParametersSimulation->gridParticleSystem.worldMin.x << mParametersSimulation->gridParticleSystem.worldMin.y << mParametersSimulation->gridParticleSystem.worldMin.z <<
                    "max" << mParametersSimulation->gridParticleSystem.worldMax.x << mParametersSimulation->gridParticleSystem.worldMax.y << mParametersSimulation->gridParticleSystem.worldMax.z;

        for(unsigned int i=0; i < mParametersSimulation->particleCount; i++)
        {
            mHostParticlePos[p++] = mParametersSimulation->gridParticleSystem.worldMin.x + (mParametersSimulation->gridParticleSystem.worldMax.x - mParametersSimulation->gridParticleSystem.worldMin.x) * frand();
            mHostParticlePos[p++] = mParametersSimulation->gridParticleSystem.worldMin.y + (mParametersSimulation->gridParticleSystem.worldMax.y - mParametersSimulation->gridParticleSystem.worldMin.y) * frand();
            mHostParticlePos[p++] = mParametersSimulation->gridParticleSystem.worldMin.z + (mParametersSimulation->gridParticleSystem.worldMax.z - mParametersSimulation->gridParticleSystem.worldMin.z) * frand();
            mHostParticlePos[p++] = 1.0f;

            mHostParticleVel[v++] = 0.0f;
            mHostParticleVel[v++] = 0.0f;
            mHostParticleVel[v++] = 0.0f;
            mHostParticleVel[v++] = 0.0f;
        }
        break;
    }

    case ParticlePlacement::PlacementGrid:
    {
        const float jitter = mParametersSimulation->particleRadius * 0.01f;
        const float spacing = mParametersSimulation->particleRadius * 2.0f;

        // If we want a cube, determine the number of particles in each dimension
        unsigned int s = (int) ceilf(powf((float) mParametersSimulation->particleCount, 1.0f / 3.0f));
        unsigned int gridSize[3];
        gridSize[0] = gridSize[1] = gridSize[2] = s;

        srand(1973);
        for(unsigned int z=0; z<gridSize[2]; z++)
        {
            for(unsigned int y=0; y<gridSize[1]; y++)
            {
                for(unsigned int x=0; x<gridSize[0]; x++)
                {
                    unsigned int i = (z*gridSize[1]*gridSize[0]) + (y*gridSize[0]) + x;
                    if (i < mParametersSimulation->particleCount)
                    {
                        mHostParticlePos[i*4+0] = (spacing * x) + mParametersSimulation->particleRadius - 1.0f + (frand()*2.0f-1.0f)*jitter;
                        mHostParticlePos[i*4+1] = (spacing * y) + mParametersSimulation->particleRadius - 1.0f + (frand()*2.0f-1.0f)*jitter;
                        mHostParticlePos[i*4+2] = (spacing * z) + mParametersSimulation->particleRadius - 1.0f + (frand()*2.0f-1.0f)*jitter;
                        mHostParticlePos[i*4+3] = 1.0f;

                        mHostParticleVel[i*4+0] = 0.0f;
                        mHostParticleVel[i*4+1] = 0.0f;
                        mHostParticleVel[i*4+2] = 0.0f;
                        mHostParticleVel[i*4+3] = 0.0f;
                    }
                }
            }
        }
        break;
    }

    case ParticlePlacement::PlacementFillSky:
    {
        float jitter = mParametersSimulation->particleRadius * 0.1f;
        const float spacing = mParametersSimulation->particleRadius * 2.02f;

        unsigned int particleNumber = 0;

        for(
            float y = mParametersSimulation->gridParticleSystem.worldMax.y - mParametersSimulation->particleRadius;
            y >= mParametersSimulation->gridParticleSystem.worldMin.y + mParametersSimulation->particleRadius && particleNumber < mParametersSimulation->particleCount;
            y -= spacing)
        {

            for(
                float x = mParametersSimulation->gridParticleSystem.worldMin.x + mParametersSimulation->particleRadius;
                x <= mParametersSimulation->gridParticleSystem.worldMax.x - mParametersSimulation->particleRadius && particleNumber < mParametersSimulation->particleCount;
                x += spacing)
            {
                for(
                    float z = mParametersSimulation->gridParticleSystem.worldMin.z + mParametersSimulation->particleRadius;
                    z <= mParametersSimulation->gridParticleSystem.worldMax.z - mParametersSimulation->particleRadius && particleNumber < mParametersSimulation->particleCount;
                    z += spacing)
                {
                    //                    qDebug() << "moving particle" << particleNumber << "to" << x << y << z;
                    mHostParticlePos[particleNumber*4+0] = x + (frand()-0.5) * jitter;
                    mHostParticlePos[particleNumber*4+1] = y + (frand()-0.5) * jitter;
                    mHostParticlePos[particleNumber*4+2] = z + (frand()-0.5) * jitter;
                    mHostParticlePos[particleNumber*4+3] = 1.0f;

                    mHostParticleVel[particleNumber*4+0] = 0.0f;
                    mHostParticleVel[particleNumber*4+1] = 0.0f;
                    mHostParticleVel[particleNumber*4+2] = 0.0f;
                    mHostParticleVel[particleNumber*4+3] = 0.0f;

                    particleNumber++;
                }
            }
        }
        break;
    }
    }

    setArray(ArrayPositions, mHostParticlePos, 0, mParametersSimulation->particleCount);
    setArray(ArrayVelocities, mHostParticleVel, 0, mParametersSimulation->particleCount);
}
예제 #18
0
bool MainController::setup()
{
    pcl::console::setVerbosityLevel(pcl::console::L_ALWAYS);

    Volume::get(ConfigArgs::get().volumeSize);

    Stopwatch::get().setCustomSignature(43543534);

    cudaSafeCall(cudaSetDevice(ConfigArgs::get().gpu));

    loadCalibration();

    std::cout << "Point resolution: " << ((int)((Volume::get().getVoxelSizeMeters().x * 1000.0f) * 10.0f)) / 10.0f << " millimetres" << std::endl;

    if(ConfigArgs::get().logFile.size())
    {
        rawRead = new RawLogReader;
        logRead = static_cast<LogReader *>(rawRead);
    }
    else
    {
        liveRead = new LiveLogReader;
        logRead = static_cast<LogReader *>(liveRead);
    }

    ThreadDataPack::get();

    trackerInterface = new TrackerInterface(logRead, depthIntrinsics);

    if(ConfigArgs::get().trajectoryFile.size())
    {
        std::cout << "Load trajectory: " << ConfigArgs::get().trajectoryFile << std::endl;
        trackerInterface->loadTrajectory(ConfigArgs::get().trajectoryFile);
    }

    systemComponents.push_back(trackerInterface);

    ThreadDataPack::get().assignFrontend(trackerInterface->getFrontend());

    cloudSliceProcessor = new CloudSliceProcessor();
    systemComponents.push_back(cloudSliceProcessor);

    if(ConfigArgs::get().extractOverlap)
    {
        trackerInterface->enableOverlap();
    }

    if(!ConfigArgs::get().incrementalMesh && ConfigArgs::get().enableMeshGenerator)
    {
        meshGenerator = new MeshGenerator();
        systemComponents.push_back(meshGenerator);
    }
    else
    {
        ThreadDataPack::get().meshGeneratorFinished.assignValue(true);
    }

    if(ConfigArgs::get().vocabFile.size() && ConfigArgs::get().onlineDeformation)
    {
        deformation = new Deformation;
        placeRecognition = new PlaceRecognition(depthIntrinsics);

        systemComponents.push_back(deformation);
        systemComponents.push_back(placeRecognition);
    }
    else
    {
        ThreadDataPack::get().deformationFinished.assignValue(true);
        ThreadDataPack::get().placeRecognitionFinished.assignValue(true);
    }

    pangoVis = new PangoVis(depthIntrinsics);

    return true;
}
예제 #19
0
void ParticleSystem::freeResources()
{
    Q_ASSERT(mIsInitialized);

    qDebug() << __PRETTY_FUNCTION__ << "freeing allocated memory...";

    delete [] mHostParticlePos;
    delete [] mHostParticleVel;

    cudaSafeCall(cudaFree(mDeviceColliderSortedPos));
    cudaSafeCall(cudaFree(mDeviceParticleVel));
    cudaSafeCall(cudaFree(mDeviceParticleSortedPos));
    cudaSafeCall(cudaFree(mDeviceParticleSortedVel));

    cudaSafeCall(cudaFree(mDeviceParticleMapGridCell));
    cudaSafeCall(cudaFree(mDeviceParticleMapIndex));
    cudaSafeCall(cudaFree(mDeviceParticleCellStart));
    cudaSafeCall(cudaFree(mDeviceParticleCellEnd));

    cudaSafeCall(cudaFree(mDeviceColliderMapGridCell));
    cudaSafeCall(cudaFree(mDeviceColliderMapIndex));
    cudaSafeCall(cudaFree(mDeviceColliderCellStart));
    cudaSafeCall(cudaFree(mDeviceColliderCellEnd));

    cudaSafeCall(cudaFree(mDeviceParticleCollisionPositions));

    cudaSafeCall(cudaGraphicsUnregisterResource(mCudaVboResourceParticlePositions));
    OpenGlUtilities::deleteVbo(mVboParticlePositions);

    setNullPointers();

    // mVboParticlePositions should now be zero, emit it and glScene won't render it
    slotEmitVboInfoAndParameters();

    mIsInitialized = false;
    qDebug() << __PRETTY_FUNCTION__ << "done.";
}
예제 #20
0
void ParticleSystem::slotInitialize()
{
    qDebug() << __PRETTY_FUNCTION__;
    if(mIsInitialized)
    {
        qDebug() << __PRETTY_FUNCTION__ << "already initialized, returning.";
        return;
    }

    OpenGlUtilities::checkError();

    // This needs to be called only once, but here it might get called more often. Shouldn't be a problem.
    initializeOpenGLFunctions();

    size_t memTotal, memFree;
    cudaSafeCall(cudaMemGetInfo(&memFree, &memTotal));
    qDebug() << __PRETTY_FUNCTION__ << "before init, device has" << memFree / 1048576 << "of" << memTotal / 1048576 << "mb free.";

    mUpdateMappingFromColliderToGridCell = true;

    // Change particleCount, so that it makes sense to use with particleRadius: we want the top third to be populated with particles.
    float volumeTotal = mParametersSimulation->gridParticleSystem.getWorldVolume();
    // Abstract particles to boxes, as that's how they'll be arranged
    float volumeParticle = pow(mParametersSimulation->particleRadius*2, 3);
    // Use as many particles as needed, but never more than, say, 32k. 64k might also work, but we don't need the density for a 32m^3 scanVolume.
    mParametersSimulation->particleCount = (volumeTotal / volumeParticle) / 3;
    if(mParametersSimulation->particleCount > 32768) mParametersSimulation->particleCount = 32768;

    // Set gridsize so that the cell-edges are never shorter than the particle's diameter! If particles were allowed to be larger than gridcells in
    // any dimension, we couldn't find all relevant neighbors by searching through only the (3*3*3)-1 = 26 cells immediately neighboring this one.
    // We can also modify this compromise by allowing larger particles and then searching (5*5*5)-1 = 124 cells. Haven't tried.

    // Using less (larger) cells is possible, it means less memory being used for the grid, but more search being done when searching for neighbors
    // because more particles now occupy a single grid cell. This might not hurt performance as long as we do not cross an unknown threshold (kernel swap size)?!
    const QVector3D particleSystemWorldSize = CudaHelper::convert(mParametersSimulation->gridParticleSystem.getWorldSize());
    mParametersSimulation->gridParticleSystem.cells.x = nextHigherPowerOfTwo(particleSystemWorldSize.x() / mParametersSimulation->particleRadius) / 2;
    mParametersSimulation->gridParticleSystem.cells.x = qBound(2, (int)mParametersSimulation->gridParticleSystem.cells.x, 128);

    mParametersSimulation->gridParticleSystem.cells.y = nextHigherPowerOfTwo(particleSystemWorldSize.y() / mParametersSimulation->particleRadius) / 2;
    mParametersSimulation->gridParticleSystem.cells.y = qBound(2, (int)mParametersSimulation->gridParticleSystem.cells.y, 128);

    mParametersSimulation->gridParticleSystem.cells.z = nextHigherPowerOfTwo(particleSystemWorldSize.z() / mParametersSimulation->particleRadius) / 2;
    mParametersSimulation->gridParticleSystem.cells.z = qBound(2, (int)mParametersSimulation->gridParticleSystem.cells.z, 128);

    const quint32 numberOfCells = mParametersSimulation->gridParticleSystem.getCellCount();
    // allocate host storage for particle positions and velocities, then set them to zero
    mHostParticlePos = new float[mParametersSimulation->particleCount * 4];
    mHostParticleVel = new float[mParametersSimulation->particleCount * 4];
    mNumberOfBytesAllocatedCpu += mParametersSimulation->particleCount * 8;

    memset(mHostParticlePos, 0, mParametersSimulation->particleCount * 4 * sizeof(float));
    memset(mHostParticleVel, 0, mParametersSimulation->particleCount * 4 * sizeof(float));

    // determine GPU data-size
    const quint32 memSizeParticleQuadrupels = sizeof(float) * 4 * mParametersSimulation->particleCount;

    // Allocate GPU data
    // Create VBO with particle positions. This is later given to particle renderer for visualization
    mVboParticlePositions = OpenGlUtilities::createVbo(memSizeParticleQuadrupels);
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceParticlePositions, mVboParticlePositions, cudaGraphicsMapFlagsNone));

    // Create VBO with collider positions. This is later given to particle renderer for visualization
    mVboColliderPositions = mPointCloudColliders->getRenderInfo()->at(0)->vbo;//OpenGlUtilities::createVbo(sizeof(float) * 4 * mSimulationParameters->colliderCountMax);
    //qDebug() << "vbo colliderpos is" << mVboColliderPositions;
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceColliderPositions, mVboColliderPositions, cudaGraphicsMapFlagsNone));
    // use vboInfo.size or mSimulationParameters->colliderCountMax?

    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleVel, memSizeParticleQuadrupels));
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;

    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderSortedPos, sizeof(float) * 4 * mPointCloudColliders->getCapacity()));
    mNumberOfBytesAllocatedGpu += sizeof(float) * 4 * mPointCloudColliders->getCapacity();

    // Here, we store the positions of each particle's last collision with the colliders
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleCollisionPositions, sizeof(float) * 4 * mParametersSimulation->particleCount));
    cudaSafeCall(cudaMemset(mDeviceParticleCollisionPositions, 0, sizeof(float) * 4 * mParametersSimulation->particleCount)); // set to (float)-zero
    mNumberOfBytesAllocatedGpu += sizeof(float) * 4 * mParametersSimulation->particleCount;

    // Sorted according to containing grid cell.
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleSortedPos, memSizeParticleQuadrupels));
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleSortedVel, memSizeParticleQuadrupels));
    mNumberOfBytesAllocatedGpu += memSizeParticleQuadrupels;

    // These two are used to map from gridcell (=hash) to particle id (=index). If we also know in which
    // indices of these arrays grid cells start and end, we can quickly find particles in neighboring cells...
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleMapGridCell, mParametersSimulation->particleCount*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleMapIndex, mParametersSimulation->particleCount*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += mParametersSimulation->particleCount * sizeof(uint) * 2;

    // Same thing as above, just for colliders
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderMapGridCell, mPointCloudColliders->getCapacity()*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderMapIndex, mPointCloudColliders->getCapacity()*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += mPointCloudColliders->getCapacity() * sizeof(uint) * 2;

    // ...and thats what we do here: in mDeviceCellStart[17], you'll find
    // where in mDeviceGridParticleHash cell 17 starts!
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleCellStart, numberOfCells*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceParticleCellEnd, numberOfCells*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += numberOfCells * sizeof(uint) * 2;

    // Same for colliders...
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderCellStart, numberOfCells*sizeof(uint)));
    cudaSafeCall(cudaMalloc((void**)&mDeviceColliderCellEnd, numberOfCells*sizeof(uint)));
    mNumberOfBytesAllocatedGpu += numberOfCells * sizeof(uint) * 2;

    qDebug() << __PRETTY_FUNCTION__ << "worldsize" << CudaHelper::convert(mParametersSimulation->gridParticleSystem.getWorldSize()) << "and particle radius" << mParametersSimulation->particleRadius << ": created system with" << mParametersSimulation->particleCount << "particles and" << mParametersSimulation->gridParticleSystem.cells.x << "*" << mParametersSimulation->gridParticleSystem.cells.y << "*" << mParametersSimulation->gridParticleSystem.cells.z << "cells";
    qDebug() << __PRETTY_FUNCTION__ << "allocated" << mNumberOfBytesAllocatedCpu << "bytes on CPU," << mNumberOfBytesAllocatedGpu << "bytes on GPU.";

    copyParametersToGpu(mParametersSimulation);

    mIsInitialized = true;

    slotResetParticles();

    slotEmitVboInfoAndParameters();

    cudaSafeCall(cudaMemGetInfo(&memFree, &memTotal));
    qDebug() << __PRETTY_FUNCTION__ << "after init, device has" << memFree / 1048576 << "of" << memTotal / 1048576 << "mb free.";
}
예제 #21
0
void kf::cuda::waitAllDefaultStream()
{
    cudaSafeCall(cudaDeviceSynchronize() );
}
예제 #22
0
파일: common.hpp 프로젝트: 112000/opencv
 template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
 {
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
     cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
 }
예제 #23
0
void vm::scanner::cuda::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
    create(sizeBytes_arg);
    cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
예제 #24
0
 TextureBinder(const A& arr, const struct texture<T, 2, readMode>& tex, const cudaChannelFormatDesc& desc) : texref(&tex)
 {
     cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
 }
예제 #25
0
 ~TextureBinder()
 {
     cudaSafeCall( cudaUnbindTexture(texref) );
 }
예제 #26
0
 TextureBinder(const PtrSz<T>& arr, const struct texture<T, 1, readMode> &tex) : texref(&tex)
 {
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
     cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) );
 }
예제 #27
0
 TextureBinder(const PtrStepSz<T>& arr, const struct texture<T, 2, readMode>& tex) : texref(&tex)
 {
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
     cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
 }
예제 #28
0
void vm::scanner::cuda::waitAllDefaultStream()
{ cudaSafeCall(cudaDeviceSynchronize() ); }
예제 #29
0
LabelImage RandomForestImage::predict(const RGBDImage& image,
         cuv::ndarray<float, cuv::host_memory_space>* probabilities, const bool onGPU, bool useDepthImages) const {

    LabelImage prediction(image.getWidth(), image.getHeight());

    const LabelType numClasses = getNumClasses();

    if (treeData.size() != ensemble.size()) {
        throw std::runtime_error((boost::format("tree data size: %d, ensemble size: %d. histograms normalized?")
                % treeData.size() % ensemble.size()).str());
    }

    cuv::ndarray<float, cuv::host_memory_space> hostProbabilities(
            cuv::extents[numClasses][image.getHeight()][image.getWidth()],
            m_predictionAllocator);

    if (onGPU) {
        cuv::ndarray<float, cuv::dev_memory_space> deviceProbabilities(
                cuv::extents[numClasses][image.getHeight()][image.getWidth()],
                m_predictionAllocator);
        cudaSafeCall(cudaMemset(deviceProbabilities.ptr(), 0, static_cast<size_t>(deviceProbabilities.size() * sizeof(float))));

        {
            utils::Profile profile("classifyImagesGPU");
            for (const boost::shared_ptr<const TreeNodes>& data : treeData) {
                classifyImage(treeData.size(), deviceProbabilities, image, numClasses, data, useDepthImages);
            }
        }

        normalizeProbabilities(deviceProbabilities);

        cuv::ndarray<LabelType, cuv::dev_memory_space> output(image.getHeight(), image.getWidth(),
                m_predictionAllocator);
        determineMaxProbabilities(deviceProbabilities, output);

        hostProbabilities = deviceProbabilities;
        cuv::ndarray<LabelType, cuv::host_memory_space> outputHost(image.getHeight(), image.getWidth(),
                m_predictionAllocator);

        outputHost = output;

        {
            utils::Profile profile("setLabels");
            for (int y = 0; y < image.getHeight(); ++y) {
                for (int x = 0; x < image.getWidth(); ++x) {
                    prediction.setLabel(x, y, static_cast<LabelType>(outputHost(y, x)));
                }
            }
        }
    } else {
        utils::Profile profile("classifyImagesCPU");

        tbb::parallel_for(tbb::blocked_range<size_t>(0, image.getHeight()),
                [&](const tbb::blocked_range<size_t>& range) {
                    for(size_t y = range.begin(); y != range.end(); y++) {
                        for(int x=0; x < image.getWidth(); x++) {

                            for (LabelType label = 0; label < numClasses; label++) {
                                hostProbabilities(label, y, x) = 0.0f;
                            }

                            for (const auto& tree : ensemble) {
                                const auto& t = tree->getTree();
                                PixelInstance pixel(&image, 0, x, y);
                                const auto& hist = t->classifySoft(pixel);
                                assert(hist.size() == numClasses);
                                for(LabelType label = 0; label<hist.size(); label++) {
                                    hostProbabilities(label, y, x) += hist[label];
                                }
                            }

                            double sum = 0.0f;
                            for (LabelType label = 0; label < numClasses; label++) {
                                sum += hostProbabilities(label, y, x);
                            }
                            float bestProb = -1.0f;
                            for (LabelType label = 0; label < numClasses; label++) {
                                hostProbabilities(label, y, x) /= sum;
                                float prob = hostProbabilities(label, y, x);
                                if (prob > bestProb) {
                                    prediction.setLabel(x, y, label);
                                    bestProb = prob;
                                }
                            }
                        }
                    }
                });
    }

    if (probabilities) {
        *probabilities = hostProbabilities;
    }

    return prediction;
}
예제 #30
-1
LabelImage RandomForestImage::improveHistograms(const RGBDImage& image, const LabelImage& labelImage, const bool onGPU, bool useDepthImages) const {

    LabelImage prediction(image.getWidth(), image.getHeight());

    const LabelType numClasses = getNumClasses();

    if (treeData.size() != ensemble.size()) {
        throw std::runtime_error((boost::format("tree data size: %d, ensemble size: %d. histograms normalized?")
                % treeData.size() % ensemble.size()).str());
    }

    cuv::ndarray<float, cuv::host_memory_space> hostProbabilities(
            cuv::extents[numClasses][image.getHeight()][image.getWidth()],
            m_predictionAllocator);

    //These offsets should have been used instead of traversing to the leaf again
/*	cuv::ndarray<unsigned int, cuv::dev_memory_space> nodeOffsets(
			cuv::extents[image.getHeight()][image.getWidth()],
			m_predictionAllocator);
*/

    if (onGPU) {
        cuv::ndarray<float, cuv::dev_memory_space> deviceProbabilities(
                cuv::extents[numClasses][image.getHeight()][image.getWidth()],
                m_predictionAllocator);
        cudaSafeCall(cudaMemset(deviceProbabilities.ptr(), 0, static_cast<size_t>(deviceProbabilities.size() * sizeof(float))));

        {
            utils::Profile profile("classifyImagesGPU");
            for (const boost::shared_ptr<const TreeNodes>& data : treeData) {
                classifyImage(treeData.size(), deviceProbabilities, image, numClasses, data, useDepthImages);
                bool found_tree = false;
				//should be change to parallel for and add lock
				for (size_t treeNr = 0; treeNr < ensemble.size(); treeNr++) {
					if (data->getTreeId() == ensemble[treeNr]->getId()) {
						found_tree  =true;
						const boost::shared_ptr<RandomTree<PixelInstance, ImageFeatureFunction> >& tree = ensemble[treeNr]->getTree();
						//this should have been used and done before trying to classify the images, since it doesn't change
						//std::vector<size_t> leafSet;
						//tree->collectLeafNodes(leafSet);
						for (int y = 0; y < image.getHeight(); y++)
							for (int x = 0; x < image.getWidth(); x++) {
								LabelType label = labelImage.getLabel(x,y);
									if (!shouldIgnoreLabel(label)) {
										PixelInstance pixel(&image, label, x, y);
										//This should be changed. When classifying the image, the nodeoffsets should be returned and those used directly
										//instead of traversing again to the leaves. As a test, can check if the nodeoffset is the same as the one returned
										//by travertoleaf
										tree->setAllPixelsHistogram(pixel);
					                }
							}
					}
					if (found_tree)
						break;
				}
            }
        }

    }
    //should also add the CPU code!
    return prediction;
}