Example #1
inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad){
  size_t pitch;
  // common choice for cuda mem align unit is 32
  if( pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32 ){
    cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch,
       obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] );
    utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
    obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) );
  }else{
    obj.shape.stride_ = obj.shape[0];
    cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch,
        obj.shape.Size() * sizeof(real_t), 1 );
    utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
  }
}
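The stride_ stored above is the row pitch expressed in elements rather than bytes. A minimal kernel sketch of how such a stride is used to index the padded rows (illustrative only, not part of mshadow):

__global__ void ScaleRows(real_t *data, index_t width, index_t height,
                          index_t stride, real_t alpha) {
    index_t x = blockIdx.x * blockDim.x + threadIdx.x;
    index_t y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // consecutive rows are stride elements apart; only width elements per row are valid
        data[y * stride + x] *= alpha;
    }
}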
Example #2
void CudaUtil::cudaCheckMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height, int line, const char* file)
{
    cudaError_t error = cudaMallocPitch(ptr, pitch, width, height);
    if (error != cudaSuccess) {
        std::ostringstream os;
        os << "cudaMallocPitch returned error code " << error << ", line " << line << ", in file " << file;
        throw CudaException(os.str());
    }
}
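A call site would normally forward __LINE__ and __FILE__ through a convenience macro. A hypothetical usage sketch (the macro name is illustrative, not from the original code):

#define CUDA_CHECK_MALLOC_PITCH(ptr, pitch, w, h) \
    CudaUtil::cudaCheckMallocPitch((ptr), (pitch), (w), (h), __LINE__, __FILE__)

float *d_image = nullptr;
size_t pitch = 0;
// allocates a pitched 512x512 float image, throwing CudaException on failure
CUDA_CHECK_MALLOC_PITCH((void**)&d_image, &pitch, 512 * sizeof(float), 512);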
Example #3
void ScanPlan::allocate(size_t elemSizeBytes, size_t numElements, size_t numRows, size_t rowPitch)
{
    const size_t blockSize = SCAN_ELTS_PER_THREAD * SCAN_CTA_SIZE;

    m_numElements = numElements;
    m_numRows = numRows;
    m_elemSizeBytes = elemSizeBytes;

    // find required number of levels
    size_t level = 0;
    size_t numElts = m_numElements;
    do
    {
        size_t numBlocks = (numElts + blockSize - 1) / blockSize;
        if (numBlocks > 1)
        {
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    m_numLevels = level;

    m_blockSums = (void**) malloc(m_numLevels * sizeof(void*));

    if (m_numRows > 1)
    {
        m_rowPitches = (size_t*) malloc((m_numLevels + 1) * sizeof(size_t));
        m_rowPitches[0] = rowPitch;
    }

    // allocate storage for block sums
    numElts = m_numElements;
    level = 0;
    do
    {
        size_t numBlocks = (numElts + blockSize - 1) / blockSize;
        if (numBlocks > 1) 
        {
            // Use cudaMallocPitch for multi-row block sums to ensure alignment
            if (m_numRows > 1)
            {
                size_t dpitch;
                cudaSafeCall(cudaMallocPitch((void**)&(m_blockSums[level]), &dpitch, numBlocks * m_elemSizeBytes, numRows));
                m_rowPitches[level+1] = dpitch / m_elemSizeBytes;
            }
            else
            {
                cudaSafeCall(cudaMalloc((void**)&(m_blockSums[level]), numBlocks * m_elemSizeBytes));
            }
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    cudaCheckMsg("ScanPlan::allocate");
}
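The matching teardown has to walk the same level structure. A sketch of what a deallocate counterpart could look like, assuming the member names used above (not taken verbatim from CUDPP):

void ScanPlan::deallocate()
{
    // one block-sums array was allocated per level
    for (size_t level = 0; level < m_numLevels; level++)
    {
        cudaSafeCall(cudaFree(m_blockSums[level]));
    }
    free(m_blockSums);
    m_blockSums = 0;

    if (m_numRows > 1)
    {
        free(m_rowPitches);
        m_rowPitches = 0;
    }
    m_numLevels = 0;
}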
Example #4
	RenderTarget::RenderTarget(COM::size_t width, COM::size_t height)
		: _texture(width, height, GL_RGBA32F, GL_RGBA, GL_FLOAT)
	{
		int sdfheight = _texture.Height();
		CUDA_CALL(cudaMallocPitch((void**)&_deviceMem, &_pitch,
			width * sizeof(float) * 4, height));

		CUDA_CALL(cudaGraphicsGLRegisterImage(&_resource, _texture.GetID(),
			GL_TEXTURE_2D, cudaGraphicsMapFlagsNone));
	}
Example #5
lcudaMatrix lcudaAllocMatrix(int width, int height)
{
	lcudaMatrix matrix;
    cudaMallocPitch((void **)&matrix.data, (size_t*)&matrix.pitch, width * sizeof(lcudaFloat), height);
	matrix.width = width;
	matrix.height = height;

	return matrix;
}
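Because each device row is padded to matrix.pitch bytes, host transfers must go through cudaMemcpy2D rather than a flat cudaMemcpy. A small sketch of an upload helper built on the struct above (lcudaCpyToMatrix is an assumed name, not part of the original library):

void lcudaCpyToMatrix(lcudaMatrix matrix, const lcudaFloat *host)
{
	// host rows are tightly packed; device rows are matrix.pitch bytes apart
	cudaMemcpy2D(matrix.data, matrix.pitch,
	             host, matrix.width * sizeof(lcudaFloat),
	             matrix.width * sizeof(lcudaFloat), matrix.height,
	             cudaMemcpyHostToDevice);
}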
Example #6
// This test specifies a single test (where you specify radius and/or iterations)
int runSingleTest(char *ref_file, char *exec_path)
{
    int nTotalErrors = 0;
    char dump_file[256];

    printf("[runSingleTest]: [%s]\n", sSDKsample);

    initCuda();

    unsigned int *dResult;
    unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int));
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));

    // run the sample radius
    {
        printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations);
        bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

        // check if kernel execution generated an error
        getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
        checkCudaErrors(cudaDeviceSynchronize());

        // readback the results to system memory
        cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch,
                     sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost);

        sprintf(dump_file, "nature_%02d.ppm", filter_radius);

        sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height);

        if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false))
        {
            printf("Image is Different ");
            nTotalErrors++;
        }
        else
        {
            printf("Image is Matching ");
        }

        printf(" <%s>\n", ref_file);
    }
    printf("\n");

    free(hResult);
    checkCudaErrors(cudaFree(dResult));

    return nTotalErrors;
}
Example #7
HRESULT RegisterD3D9ResourceWithCUDA()
{
    // 2D
    // register the Direct3D resources that we'll use
    // we'll read to and write from g_texture_2d, so don't set any special map flags for it
    cudaGraphicsD3D9RegisterResource(&g_texture_2d.cudaResource, g_texture_2d.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_2d) failed");
    // cuda cannot write into the texture directly : the texture is seen as a cudaArray and can only be mapped as a texture
    // Create a buffer so that cuda can write into it
    // pixel fmt is DXGI_FORMAT_R32G32B32A32_FLOAT
    cudaMallocPitch(&g_texture_2d.cudaLinearMemory, &g_texture_2d.pitch, g_texture_2d.width * sizeof(float) * 4, g_texture_2d.height);
    getLastCudaError("cudaMallocPitch (g_texture_2d) failed");
    cudaMemset(g_texture_2d.cudaLinearMemory, 1, g_texture_2d.pitch * g_texture_2d.height);

    // CUBE
    cudaGraphicsD3D9RegisterResource(&g_texture_cube.cudaResource, g_texture_cube.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_cube) failed");
    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
    cudaMallocPitch(&g_texture_cube.cudaLinearMemory, &g_texture_cube.pitch, g_texture_cube.size * 4, g_texture_cube.size);
    getLastCudaError("cudaMallocPitch (g_texture_cube) failed");
    cudaMemset(g_texture_cube.cudaLinearMemory, 1, g_texture_cube.pitch * g_texture_cube.size);
    getLastCudaError("cudaMemset (g_texture_cube) failed");

    // 3D
    cudaGraphicsD3D9RegisterResource(&g_texture_vol.cudaResource, g_texture_vol.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_vol) failed");
    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
    //cudaMallocPitch(&g_texture_vol.cudaLinearMemory, &g_texture_vol.pitch, g_texture_vol.width * 4, g_texture_vol.height * g_texture_vol.depth);
    cudaMalloc(&g_texture_vol.cudaLinearMemory, g_texture_vol.width * 4 * g_texture_vol.height * g_texture_vol.depth);
    g_texture_vol.pitch = g_texture_vol.width * 4;
    getLastCudaError("cudaMallocPitch (g_texture_vol) failed");
    cudaMemset(g_texture_vol.cudaLinearMemory, 1, g_texture_vol.pitch * g_texture_vol.height * g_texture_vol.depth);
    getLastCudaError("cudaMemset (g_texture_vol) failed");

    return S_OK;
}
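Kernels write into the pitched cudaLinearMemory buffers; each frame the result is copied into the texture's cudaArray once the resource is mapped. A sketch of that copy step as it typically appears in this sample family (assuming the globals above and that cudaGraphicsMapResources has already been called):

cudaArray *cuArray;
cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_2d.cudaResource, 0, 0);
getLastCudaError("cudaGraphicsSubResourceGetMappedArray (g_texture_2d) failed");

// copy the pitched linear buffer into the texture's cudaArray
cudaMemcpy2DToArray(cuArray, 0, 0,
                    g_texture_2d.cudaLinearMemory, g_texture_2d.pitch,
                    g_texture_2d.width * sizeof(float) * 4, g_texture_2d.height,
                    cudaMemcpyDeviceToDevice);
getLastCudaError("cudaMemcpy2DToArray (g_texture_2d) failed");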
Example #8
GLFluids::GLFluids(QWidget *parent)
    : QGLWidget(parent),
      QGLFunctions()
{
    vbo = 0;

    wWidth = qMax(512, DIM);
    wHeight = qMax(512, DIM);

    hvfield = (float2 *)malloc(sizeof(float2) * DS);
    memset(hvfield, 0, sizeof(float2) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(float2)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(float2) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(float2) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(float2) * PDS);

    setup_texture(DIM, DIM);
    bind_texture();

    // Create particle array
    particles = (float2 *)malloc(sizeof(float2) * DS);
    memset(particles, 0, sizeof(float2) * DS);
    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);

    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    QTimer *timer = new QTimer(this);
    connect(timer, &QTimer::timeout, [&](){
        simulateFluids();
        updateGL();
    });
    timer->start(0);
}
Example #9
boost::shared_ptr<DeviceMatrix> makeDeviceMatrix(size_t height, size_t width)
{
		DeviceMatrix* mat = new DeviceMatrix();
		mat->width = width;
		mat->height = height;
		CUDA_CALL
			(cudaMallocPitch((void**)&mat->data, &mat->pitch,
			mat->width * sizeof(float),
			mat->height));

		// I can't imagine getting a pitch that's not a multiple of a float
		assert(mat->pitch % sizeof(float) == 0);
		// We want to express everything in floats
		mat->pitch /= sizeof(float);

		//printf("cudaMalloc: %p\n", mat->data);

		return boost::shared_ptr<DeviceMatrix>(mat, deleteDeviceMatrix);
}
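Since mat->pitch is stored in floats, it has to be converted back to bytes when handed to the 2D copy routines. A hedged sketch of a host download for this DeviceMatrix (the helper name is illustrative, not from the original source):

void copyToHost(const DeviceMatrix &mat, float *host)
{
	// mat.pitch is expressed in floats; cudaMemcpy2D expects bytes
	CUDA_CALL(cudaMemcpy2D(host, mat.width * sizeof(float),
		mat.data, mat.pitch * sizeof(float),
		mat.width * sizeof(float), mat.height,
		cudaMemcpyDeviceToHost));
}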
Example #10
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runBenchmark(int argc, char **argv)
{
    printf("[runBenchmark]: [%s]\n", sSDKsample);

    loadImageData(argc, argv);
    initCuda();

    unsigned int *dResult;
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));
    sdkStartTimer(&kernel_timer);

    // warm-up
    bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    checkCudaErrors(cudaDeviceSynchronize());

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles);

    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    }

    // check if kernel execution generated an error and sync host
    getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&kernel_timer);

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    printf("bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u\n",
           (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1);
    printf("\n");

    return 0;
}
Example #11
cudaError_t GridGpu::copyTrajBlocks()
{
    // Copy gridding k-trajectory data
    int maxP = 0;
    for (int i = 0; i < m_trajBlocks.size(); i++) {
        if (m_trajBlocks[i].size() > maxP) maxP = m_trajBlocks[i].size();
    }
    m_d_trajBlocks.trajWidth = maxP;

    cudaMallocPitch(&m_d_trajBlocks.trajPoints, &m_d_trajBlocks.pitchTraj, maxP * sizeof(TrajPointGpu), m_trajBlocks.size());
    cudaMemset(m_d_trajBlocks.trajPoints, 0, m_d_trajBlocks.pitchTraj * m_trajBlocks.size());
    qWarning() << "Max traj points per block:" << maxP;

    for (int i = 0; i < m_trajBlocks.size(); i++) {
        char *row = (char *)m_d_trajBlocks.trajPoints + i * m_d_trajBlocks.pitchTraj;
        cudaMemcpy(row, m_trajBlocks[i].data(), m_trajBlocks[i].size() * sizeof(TrajPointGpu), cudaMemcpyHostToDevice);
    }

    return cudaGetLastError();
}
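The per-row loop is needed because the host blocks have different lengths. If every block held exactly maxP points packed contiguously (an assumption, not the general case here), the whole upload would collapse into one strided copy:

// packedHostTraj is a hypothetical contiguous host array of maxP points per block
cudaMemcpy2D(m_d_trajBlocks.trajPoints, m_d_trajBlocks.pitchTraj,
             packedHostTraj, maxP * sizeof(TrajPointGpu),
             maxP * sizeof(TrajPointGpu), m_trajBlocks.size(),
             cudaMemcpyHostToDevice);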
Example #12
void vm::scanner::cuda::DeviceMemory2D::create(int rows_arg, int colsBytes_arg)
{
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;
            
    if( rows_arg > 0 && colsBytes_arg > 0)
    {        
        if( data_ )
            release();
              
        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;
                        
        cudaSafeCall( cudaMallocPitch( (void**)&data_, &step_, colsBytes_, rows_) );        

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
Example #13
void pcl::gpu::DeviceMemory2D::create(int rows_arg, int colsBytes_arg)
{
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;

    if( rows_arg > 0 && colsBytes_arg > 0)
    {
        if( data_ )
            release();

        colsBytes_ = colsBytes_arg;
        rows_ = rows_arg;
        printf( "[CUDA] Allocating memory %d x %d = %d bytes.\n", colsBytes_, rows_, colsBytes_ * rows_ );

        cudaSafeCall( cudaMallocPitch( (void**)&data_, &step_, colsBytes_, rows_) );

        //refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_ = new int;
        *refcount_ = 1;
    }
}
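A release() consistent with the refcount bookkeeping above might look like the following simplified sketch (the actual PCL implementation uses atomic decrements and handles a few more states; this is an assumption for illustration):

void pcl::gpu::DeviceMemory2D::release()
{
    if (refcount_ && --(*refcount_) == 0)
    {
        // last owner frees the pitched device allocation
        delete refcount_;
        cudaSafeCall( cudaFree(data_) );
    }
    colsBytes_ = 0;
    rows_ = 0;
    data_ = 0;
    step_ = 0;
    refcount_ = 0;
}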
Example #14
DeviceMatrix3D::Ptr makeDeviceMatrix3D(size_t dim_t, size_t dim_y, 
	size_t dim_x){
		DeviceMatrix3D* mat = new DeviceMatrix3D();
		mat->dim_x = dim_x;
		mat->dim_y = dim_y;
		mat->dim_t = dim_t;
		size_t pitch;
		CUDA_CALL
			(cudaMallocPitch((void**)&mat->data, &pitch,
			dim_x * sizeof(float),
			dim_y * dim_t));
		// I can't imagine getting a pitch that's not a multiple of a float
		assert(pitch % sizeof(float) == 0);
		// We want to express everything in floats
		pitch /= sizeof(float);

		mat->pitch_y = pitch;
		mat->pitch_t = dim_y*mat->pitch_y;

		return DeviceMatrix3D::Ptr(mat, deleteDeviceMatrix3D);
}
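With pitch_y in floats and pitch_t = dim_y * pitch_y, element (t, y, x) lives at data[t * pitch_t + y * pitch_y + x]. A short illustrative device helper (not from the original source):

__device__ inline float& at(const DeviceMatrix3D& mat, size_t t, size_t y, size_t x)
{
	// pitch_y and pitch_t are expressed in floats, so plain index arithmetic works
	return mat.data[t * mat.pitch_t + y * mat.pitch_y + x];
}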
Example #15
// WARNING: ignorePitch = true should only be used for testing!
void CuWrapper::CuInit(int device, bool ignorePitch)
{
	if(!CuInitialized)
	{
		Generic::Print("Initializing device");

		CuExe(cudaSetDevice(device), "cudaSetDevice failed, no device found");
		CuExe(cudaGetDeviceProperties(&CuProperties, 0), "Getting device properties failed");

		// Make sure the default block size does not exceed the maximum of the device
		if(CuBlockDim.x * CuBlockDim.y * CuBlockDim.z > CuProperties.maxThreadsPerBlock)
		{
			int dim = (int)floor(sqrt((float)CuProperties.maxThreadsPerBlock));
			dim = dim > 16 ? dim - dim % 16 : dim;
			CuBlockDim.x = dim;
			CuBlockDim.y = dim;
			CuBlockDim.z = 1;
		}

		// Determine the device pitch in bytes by allocating an integer array of dimension 1x1.
		// Use this to pad matrices on the host
		if(ignorePitch)
		{
			// WARNING: this will only make CuGetPitchSize work for CU_TYPE! IgnorePitch should only be used for testing
			CuPitchBytes = sizeof(CU_TYPE);
		}
		else
		{
			int *tmp;
			CuExe(cudaMallocPitch((void**)&tmp, &CuPitchBytes, sizeof(int), 1));
			CuExe(cudaFree(tmp));
		}

		CuInitialized = true;
	}
}
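The probed CuPitchBytes can then be used to round host row widths up to a device-friendly multiple. A sketch of what the CuGetPitchSize helper referenced in the comment might look like (its real definition is not shown in this example, so this is an assumption):

// Round a row of `width` CU_TYPE elements up to a whole number of pitch units.
size_t CuWrapper::CuGetPitchSize(size_t width)
{
	size_t widthBytes = width * sizeof(CU_TYPE);
	size_t padded = ((widthBytes + CuPitchBytes - 1) / CuPitchBytes) * CuPitchBytes;
	return padded / sizeof(CU_TYPE);
}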
Example #16
void run_2D_GLOBAL_MEMORY()
{
	int arrayWidth	= 4;
	int arrayHeight	= 4;

	bool SEQ = true; 

	/* Host allocation */
	float* inArr_1_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
	float* inArr_2_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
	float* outArr_H	= (float*) malloc(arrayWidth * arrayHeight * sizeof(float));

	/* Fill arrays */
	int index = 0; 
	if (SEQ)
	{
		int ctr = 0; 
		for(int j = 0; j < (arrayHeight); j++)
		{
			for(int i = 0; i < (arrayWidth); i++)
			{
				index = ((j * arrayWidth) + i);

				inArr_1_H[index] = (float) ctr++;  
				inArr_2_H[index] = (float) ctr++; 
				outArr_H[index] = (float) 0;
			}
		}
	}
	else 
	{
		for(int j = 0; j < (arrayHeight); j++)
		{
			for(int i = 0; i < (arrayWidth); i++)
			{
				index = ((j * arrayWidth) + i);

				inArr_1_H[index] =  (float)rand()/(float)RAND_MAX;
				inArr_2_H[index] = (float)rand()/(float)RAND_MAX;
				outArr_H[index] = 0; 
			}
		}
	}
	    
	/* Print host arrays */ 
	printf("inArr_1_H \n");
	print_2D_Array(inArr_1_H, arrayWidth, arrayHeight);
	printf("inArr_2_H \n");
	print_2D_Array(inArr_2_H, arrayWidth, arrayHeight);

	/* Device allocation + <__pitch> */
	float *inArr_1_D, *inArr_2_D, *outArr_D;
	size_t __pitch;
	cudaMallocPitch((void**)&inArr_1_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
	cudaMallocPitch((void**)&inArr_2_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
	cudaMallocPitch((void**)&outArr_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);

	/* Print __pitch */
	printf("__pitch %d \n", (__pitch/sizeof(float))); 

	/* Uploading data */ 
	cudaMemcpy2D(inArr_1_D, __pitch, inArr_1_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
	cudaMemcpy2D(inArr_2_D, __pitch, inArr_2_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
 
	/* Gridding */
	dim3 __numBlocks(1,1,1);
	dim3 __numThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
	__numBlocks.x = ((arrayWidth / BLOCK_SIZE) + (((arrayWidth) % BLOCK_SIZE) == 0 ? 0:1));
	__numBlocks.y = ((arrayHeight / BLOCK_SIZE) + (((arrayHeight) % BLOCK_SIZE) == 0 ? 0:1));

	/* Kernel invocation */
	add_2D_Array(inArr_1_D, inArr_2_D, outArr_D, arrayWidth, arrayHeight, __pitch, __numBlocks, __numThreadsPerBlock); 

	/* Synchronization (cudaThreadSynchronize is deprecated in favor of cudaDeviceSynchronize) */
	cudaDeviceSynchronize();

	/* Download result */
	cudaMemcpy2D(outArr_H, arrayHeight * sizeof(float), outArr_D, __pitch, arrayHeight * sizeof(float), arrayWidth, cudaMemcpyDeviceToHost);

	/* Free device arrays */
	cudaFree(inArr_1_D);
	cudaFree(inArr_2_D);
	cudaFree(outArr_D);

	/* Display results */
	printf("outArr \n");
	print_2D_Array(outArr_H, arrayWidth, arrayHeight);
}
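The add_2D_Array wrapper invoked above is not shown; a kernel consistent with this pitched layout would take the pitch in bytes and locate each row with char* arithmetic. A sketch under that assumption:

__global__ void add_2D_Array_kernel(const float *in1, const float *in2, float *out,
                                    int width, int height, size_t pitch)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	if (x < width && y < height)
	{
		// pitch is in bytes, so rows are addressed via char* offsets
		const float *row1 = (const float*)((const char*)in1 + y * pitch);
		const float *row2 = (const float*)((const char*)in2 + y * pitch);
		float *rowO = (float*)((char*)out + y * pitch);
		rowO[x] = row1[x] + row2[x];
	}
}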
Example #17
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
	int Nx, Ny, Nz, max_iters;
	int blockX, blockY, blockZ;

	if (argc == 8) {
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", 
			argv[0]);
		exit(1);
	}

	// Get the number of GPUS
	int number_of_devices;
	checkCuda(cudaGetDeviceCount(&number_of_devices));
  
  if (number_of_devices < 2) {
  	printf("Less than two devices were found.\n");
  	printf("Exiting...\n");

  	return -1;
  }

	// Decompose along the Z-axis
	int _Nz = Nz/number_of_devices;

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Check if ECC is turned on
	ECCCheck(number_of_devices);

	// Set the number of OpenMP threads
	omp_set_num_threads(number_of_devices);

	#pragma omp parallel
	{
		unsigned int nthreads = omp_get_num_threads();

		#pragma omp single
		{
			printf("Number of OpenMP threads: %d\n", nthreads);
		}
	}

  // CPU memory operations
  int dt_size = sizeof(_DOUBLE_);

	_DOUBLE_ *u_new, *u_old;

	u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(u_old, u_new, h, Nx, Ny, Nz);

	// Allocate and generate arrays on the host
	size_t pitch_bytes;
	size_t pitch_gc_bytes;

	_DOUBLE_ *h_Unew, *h_Uold;
	_DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
	_DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
	_DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

	h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(h_Uold, h_Unew, h, Nx, Ny, Nz);

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		// Pinned host allocations (a plain malloc here would be overwritten by cudaHostAlloc and leak)
		checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

		init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

	// GPU memory operations
	_DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
	_DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
	_DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaSetDevice(tid));

		CopyToConstantMemory(c0, c1);

		checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	}

	// Copy data from host to the device
	double HtD_timer = 0.;
	HtD_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
		checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
	}
	HtD_timer += omp_get_wtime();

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	double compute_timer = 0.;
  compute_timer -= omp_get_wtime();

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		for(int iterations = 0; iterations < max_iters; iterations++)
		{
			// Compute inner nodes
			checkCuda(cudaSetDevice(tid));
			ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

			// Copy right boundary data to host
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
				checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}
			// Copy left boundary data to host
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
				checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}

			#pragma omp barrier

			// Copy right boundary data to device 1
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				
				checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
			}

			// Copy left boundary data to device 0
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));

				checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
			}

			// Swap pointers on the host
			#pragma omp barrier
			checkCuda(cudaSetDevice(tid));
			checkCuda(cudaDeviceSynchronize());
			swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
		}
	}

	compute_timer += omp_get_wtime();

  // Copy data from device to host
	double DtH_timer = 0;
  DtH_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
	}
	DtH_timer += omp_get_wtime();

	// Merge sub-domains into a one big domain
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

   	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

    float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
    PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

    _DOUBLE_ t = max_iters * dt;
    CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaFree(d_s_Unews[tid]));
		checkCuda(cudaFree(d_s_Uolds[tid]));
		checkCuda(cudaFree(d_right_send_buffer[tid]));
		checkCuda(cudaFree(d_left_send_buffer[tid]));
		checkCuda(cudaFree(d_right_receive_buffer[tid]));
		checkCuda(cudaFree(d_left_receive_buffer[tid]));
		checkCuda(cudaFreeHost(h_s_Unews[tid]));
		checkCuda(cudaFreeHost(h_s_Uolds[tid]));
		checkCuda(cudaFreeHost(left_send_buffer[tid]));
		checkCuda(cudaFreeHost(right_send_buffer[tid]));
		checkCuda(cudaFreeHost(left_receive_buffer[tid]));
		checkCuda(cudaFreeHost(right_receive_buffer[tid]));
		checkCuda(cudaDeviceReset());
	}

  free(u_old);
  free(u_new);

	return 0;
}
Example #18
void run_back_projection_with_normal_estimate(
	std::vector<float4>& vertices,
	std::vector<float4>& normals,
	const std::vector<ushort>& depth_buffer,
	uint width,
	uint height,
	ushort max_depth)
{
	StopWatchInterface *kernel_timer = nullptr;

	ushort* h_depth_buffer = (ushort*)depth_buffer.data();

	size_t in_pitch, out_pitch;

	ushort* d_depth_buffer = nullptr;
	// copy image data to array
	checkCudaErrors(cudaMallocPitch(&d_depth_buffer, &in_pitch, sizeof(ushort) * width, height));
	checkCudaErrors(cudaMemcpy2D(
		d_depth_buffer,
		in_pitch,
		h_depth_buffer,
		sizeof(ushort) * width,
		sizeof(ushort) * width,
		height,
		cudaMemcpyHostToDevice));



	float4* d_vertex_buffer;
	checkCudaErrors(cudaMallocPitch(
		&d_vertex_buffer,
		&out_pitch,
		width * sizeof(float4),
		height));

	float4* d_normal_buffer;
	checkCudaErrors(cudaMallocPitch(
		&d_normal_buffer,
		&out_pitch,
		width * sizeof(float4),
		height));


	sdkCreateTimer(&kernel_timer);
	sdkStartTimer(&kernel_timer);

	Eigen::Matrix4f h_inverse_projection = perspective_matrix_inverse<float>(fov_y, aspect_ratio, near_plane, far_plane);
	//bilateralFilter_normal_estimate_float4((OutputPixelType*)dOutputImage, (InputPixelType*)dInputImage, width, height, in_pitch, out_pitch, max_depth, euclidean_delta, filter_radius, iterations, kernel_timer);
	back_projection_with_normal_estimation(d_vertex_buffer, d_normal_buffer, d_depth_buffer, width, height, max_depth, in_pitch, out_pitch, h_inverse_projection.data());

	checkCudaErrors(cudaDeviceSynchronize());
	sdkStopTimer(&kernel_timer);
	std::cout << "Kernel Timer                              : " << kernel_timer->getTime() << " msec" << std::endl;
	sdkDeleteTimer(&kernel_timer);

	vertices.resize(depth_buffer.size());
	normals.resize(depth_buffer.size());

	cudaMemcpy2D(
		vertices.data(),
		sizeof(float4) * width,
		d_vertex_buffer,
		out_pitch,
		sizeof(float4) * width,
		height,
		cudaMemcpyDeviceToHost);

	cudaMemcpy2D(
		normals.data(),
		sizeof(float4) * width,
		d_normal_buffer,
		out_pitch,
		sizeof(float4) * width,
		height,
		cudaMemcpyDeviceToHost);


	checkCudaErrors(cudaFree(d_depth_buffer));
	checkCudaErrors(cudaFree(d_vertex_buffer));
	checkCudaErrors(cudaFree(d_normal_buffer));
}
Example #19
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
#ifndef OPTIMUS
    devID = findCudaGLDevice(argc, (const char **)argv);
#else
    devID = gpuGetMaxGflopsDeviceId();
#endif

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array in host memory
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

#ifdef BROADCAST
	int step = 1;

	// Broadcasted visualization stepping.
	if (argc > 3)
		step = atoi(argv[3]);

	// Create additional space to store particle packets
	// for broadcasting.
	wstep = step; hstep = step;
	int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize;
	if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize)
		npackets++;
	packets = (char*)malloc(npackets *
		(UdpBroadcastServer::PacketSize + sizeof(unsigned int)));
#endif

    initParticles(particles, DIM, DIM);

#if defined(OPTIMUS) || defined(BROADCAST)
    // Create particle array in device memory
    cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS);
    cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice);
#endif

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

#ifndef OPTIMUS
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");
#endif

    if (ref_file)
    {
        autoTest(argv);
        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
#ifdef BROADCAST
		const char *sv_addr = "127.0.0:9097";
		const char *bc_addr = "127.255.255.2:9097";

		// Server address
		if (argc > 2)
			sv_addr = argv[2];

		// Broadcast address
		if (argc > 1)
			bc_addr = argv[1];

		server.reset(new UdpBroadcastServer(sv_addr, bc_addr));

		// Listen to clients' feedbacks in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &feedback_listener, &step);
		}

		// Broadcast the particles state in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &broadcaster, &step);
		}
#endif
#if defined (__APPLE__) || defined(MACOSX)
        atexit(cleanup);
#else
        glutCloseFunc(cleanup);
#endif
        glutMainLoop();
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
Example #20
cudaError_t WINAPI wine_cudaMallocPitch( void** devPtr, size_t* pitch, size_t widthInBytes, size_t height ) {
    WINE_TRACE("\n");
    return cudaMallocPitch( devPtr, pitch, widthInBytes, height );
}
Example #21
///////////////////////
// Main program entry
///////////////////////
int main(int argc, char** argv)
{
	unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ;
	int rank, numberOfProcesses;

	if (argc == 8)
	{
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
		exit(1);
	}

  InitializeMPI(&argc, &argv, &rank, &numberOfProcesses);
  AssignDevices(rank);
  ECCCheck(rank);

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Copy constants to Constant Memory on the GPUs
	CopyToConstantMemory(c0, c1);

	// Decompose along the z-axis
	const int _Nz = Nz/numberOfProcesses;
  const int dt_size = sizeof(_DOUBLE_);

    // Host memory allocations
    _DOUBLE_ *u_new, *u_old;
    _DOUBLE_ *h_Uold;

    u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    if (rank == 0)
    {
    	h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); 
    }

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate host subdomains
    _DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses];
    _DOUBLE_ *left_send_buffer, *left_receive_buffer;
    _DOUBLE_ *right_send_buffer, *right_receive_buffer;

    // h_s_Unews and h_s_Uolds are pinned via cudaHostAlloc below; a plain malloc here would leak

#if defined(DEBUG) || defined(_DEBUG)
  if (rank == 0)
  {
    for (int i = 0; i < numberOfProcesses; i++)
    {
        checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    }
  }
#endif

    checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));

    checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

    init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank);

	// GPU stream operations
	cudaStream_t compute_stream;
	cudaStream_t data_stream;

	checkCuda(cudaStreamCreate(&compute_stream));
	checkCuda(cudaStreamCreate(&data_stream));

	// GPU Memory Operations
	size_t pitch_bytes, pitch_gc_bytes;

  _DOUBLE_ *d_s_Unews, *d_s_Uolds;
  _DOUBLE_ *d_right_send_buffer, *d_left_send_buffer;
  _DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer;

  checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
  checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));

  checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));

	// Copy subdomains from host to device and get walltime
	double HtD_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

  checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
  checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));

  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	unsigned int ghost_width = 1;

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

  // GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);

	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	//MPI_Status status;
	MPI_Status status[numberOfProcesses];
	MPI_Request gather_send_request[numberOfProcesses];
	MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses];
	MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses];

	double compute_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
  compute_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	for(unsigned int iterations = 0; iterations < max_iters; iterations++)
	{
		// Compute right boundary data on device 0
		if (rank == 0) {
      int kstart = (_Nz+1)-ghost_width;
	    int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
			
			checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank]));
		}
		else
		{
			int kstart = 1;
			int kstop = 1+ghost_width;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
			
			checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank]));
		}

		// Compute inner nodes for device 0
		if (rank == 0) {
			int kstart = 1;
			int kstop = (_Nz+1)-ghost_width;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}
		// Compute inner nodes for device 1
		else
		{
			int kstart = 1+ghost_width;
			int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}

		// Receive data from device 1
		if (rank == 0) {
			MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank]));
		}
		else
		{
			MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank]));
		}

		if (rank == 0) {
      MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
		}

		if (rank == 0)
		{
			MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE));
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE));
		}

		// Swap pointers on the host
		checkCuda(cudaDeviceSynchronize());
		swap(_DOUBLE_*, d_s_Unews, d_s_Uolds);
	}

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	compute_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	// Copy data from device to host
	double DtH_timer = 0;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
  DtH_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault));

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	DtH_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

  // Gather results from subdomains
  MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank]));

	if (rank == 0)
	{
		for (int i = 0; i < numberOfProcesses; i++)
		{
			MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank]));
			merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i);
		}
	}

	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	if (rank == 0)
	{
		cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
	}
#endif

	if (rank == 0)
	{
		float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
		PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

		_DOUBLE_ t = max_iters * dt;
		CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);
	}

	Finalize();

  // Free device memory
  checkCuda(cudaFree(d_s_Unews));
  checkCuda(cudaFree(d_s_Uolds));
  checkCuda(cudaFree(d_right_send_buffer));
  checkCuda(cudaFree(d_left_send_buffer));
  checkCuda(cudaFree(d_right_receive_buffer));
  checkCuda(cudaFree(d_left_receive_buffer));

  // Free host memory
  checkCuda(cudaFreeHost(h_s_Unews));
  checkCuda(cudaFreeHost(h_s_Uolds));

#if defined(DEBUG) || defined(_DEBUG)
  if (rank == 0)
  {
  	for (int i = 0; i < numberOfProcesses; i++)
  	{
  		checkCuda(cudaFreeHost(h_s_rbuf[i]));
  	}

    free(h_Uold);
  }
#endif

  checkCuda(cudaFreeHost(left_send_buffer));
  checkCuda(cudaFreeHost(left_receive_buffer));
  checkCuda(cudaFreeHost(right_send_buffer));
  checkCuda(cudaFreeHost(right_receive_buffer));

  checkCuda(cudaDeviceReset());

  free(u_old);
  free(u_new);

  return 0;
}
Example #22
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
                const WLEMData::ScalarT* const previous, size_t channels, size_t samples, const WLEMData::ScalarT* const coeffs,
                size_t coeffSize )
{
    CuScalarT *dev_in = NULL;
    size_t pitchIn;

    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;

    CuScalarT *dev_out = NULL;
    size_t pitchOut;

    CuScalarT *dev_co = NULL;

    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ), samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
                                        coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof( CuScalarT );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples, dev_co, coeffSize,
                    pitchOut, pitchIn, pitchPrev );
    cudaError_t kernelError = cudaGetLastError();

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall(
                        cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut, samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    if( elapsedTime > -1.0 )
    {
        return elapsedTime;
    }
    else
    {
        throw WException( "Error in cudaFilter()" );
    }
}
Example #23
FrameSource::FrameStatus GStreamerBaseFrameSourceImpl::fetch(vx_image image, vx_uint32 /*timeout*/)
{
    if (end)
    {
        close();
        return FrameSource::CLOSED;
    }

    handleGStreamerMessages();

    if (gst_app_sink_is_eos(GST_APP_SINK(sink)))
    {
        close();
        return FrameSource::CLOSED;
    }

    if ((lastFrameTimestamp.toc()/1000.0) > Application::get().getSourceDefaultTimeout())
    {
        close();
        return FrameSource::CLOSED;
    }

    lastFrameTimestamp.tic();

#if GST_VERSION_MAJOR == 0
    std::unique_ptr<GstBuffer, GStreamerObjectDeleter> bufferHolder(
        gst_app_sink_pull_buffer(GST_APP_SINK(sink)));
    GstBuffer* buffer = bufferHolder.get();
#else
    std::unique_ptr<GstSample, GStreamerObjectDeleter> sample(gst_app_sink_pull_sample(GST_APP_SINK(sink)));

    if (!sample)
    {
        close();
        return FrameSource::CLOSED;
    }

    GstBuffer* buffer = gst_sample_get_buffer(sample.get());
#endif

    gint          width;
    gint          height;

#if GST_VERSION_MAJOR == 0
    std::unique_ptr<GstCaps, GStreamerObjectDeleter> bufferCapsHolder(gst_buffer_get_caps(buffer));
    GstCaps* bufferCaps = bufferCapsHolder.get();
#else
    GstCaps* bufferCaps = gst_sample_get_caps(sample.get());
#endif
    // bail out if no caps
    assert(gst_caps_get_size(bufferCaps) == 1);
    GstStructure* structure = gst_caps_get_structure(bufferCaps, 0);

    // bail out if width or height are 0
    if (!gst_structure_get_int(structure, "width", &width) ||
            !gst_structure_get_int(structure, "height", &height))
    {
        close();
        return FrameSource::CLOSED;
    }

    int depth = 3;
#if GST_VERSION_MAJOR > 0
    depth = 0;
    const gchar* name = gst_structure_get_name(structure);
    const gchar* format = gst_structure_get_string(structure, "format");

    if (!name || !format)
    {
        close();
        return FrameSource::CLOSED;
    }

    // we support 2 types of data:
    //     video/x-raw, format=RGB   -> 8bit, 3 channels
    //     video/x-raw, format=GRAY8 -> 8bit, 1 channel
    if (strcasecmp(name, "video/x-raw") == 0)
    {
        if (strcasecmp(format, "RGB") == 0)
        {
            depth = 3;
        }
        else if(strcasecmp(format, "GRAY8") == 0)
        {
            depth = 1;
        }
    }
#endif
    if (depth == 0)
    {
        close();
        return FrameSource::CLOSED;
    }

    vx_imagepatch_addressing_t decodedImageAddr;
    decodedImageAddr.dim_x = width;
    decodedImageAddr.dim_y = height;
    decodedImageAddr.stride_x = depth;
    // GStreamer uses a stride equal to the width rounded up to the nearest multiple of 4
    decodedImageAddr.stride_y = ((width*depth+3)/4)*4;
    decodedImageAddr.scale_x = 1;
    decodedImageAddr.scale_y = 1;
    vx_image decodedImage = NULL;
    vx_df_image_e vx_type_map[5] = { VX_DF_IMAGE_VIRT, VX_DF_IMAGE_U8,
                                     VX_DF_IMAGE_VIRT, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX };

    // fetch image width and height
    vx_uint32 actual_width, actual_height;
    vx_df_image_e actual_format;
    NVXIO_SAFE_CALL( vxQueryImage(image, VX_IMAGE_ATTRIBUTE_WIDTH, (void *)&actual_width, sizeof(actual_width)) );
    NVXIO_SAFE_CALL( vxQueryImage(image, VX_IMAGE_ATTRIBUTE_HEIGHT, (void *)&actual_height, sizeof(actual_height)) );
    NVXIO_SAFE_CALL( vxQueryImage(image, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&actual_format, sizeof(actual_format)) );
    bool needScale = width != (int)configuration.frameWidth || height != (int)configuration.frameHeight;

    // config and actual image sizes must be the same!
    if ((actual_height != configuration.frameHeight) ||
            (actual_width != configuration.frameWidth) ||
            (actual_format != configuration.format))
    {
        close();

        NVXIO_THROW_EXCEPTION("Actual image [ " << actual_width << " x " << actual_height <<
                              " ] does not equal configuration one [ " << configuration.frameWidth
                              << " x " << configuration.frameHeight << " ]");
    }

    // we assume that the decoded image will have no more than 3 channels per pixel
    if (!devMem)
    {
        NVXIO_ASSERT( cudaSuccess == cudaMallocPitch(&devMem, &devMemPitch, width * 3, height) );
    }

    // check if decoded image format has changed
    if (scaledImage)
    {
        vx_df_image_e scaled_format;
        NVXIO_SAFE_CALL( vxQueryImage(scaledImage, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&scaled_format, sizeof(scaled_format)) );

        if (scaled_format != vx_type_map[depth])
        {
            vxReleaseImage(&scaledImage);
            scaledImage = NULL;
        }
    }

    if (needScale && !scaledImage)
    {
        scaledImage = vxCreateImage(vxContext, configuration.frameWidth,
                                    configuration.frameHeight, vx_type_map[depth]);
        NVXIO_CHECK_REFERENCE( scaledImage );
    }

#if GST_VERSION_MAJOR == 0
    bool needConvert = configuration.format != VX_DF_IMAGE_RGB;
    void * decodedPtr = GST_BUFFER_DATA(buffer);
#else
    GstMapInfo info;

    gboolean success = gst_buffer_map(buffer, &info, (GstMapFlags)GST_MAP_READ);
    if (!success)
    {
        printf("GStreamer: unable to map buffer\n");
        close();
        return FrameSource::CLOSED;
    }

    bool needConvert = configuration.format != vx_type_map[depth];
    void * decodedPtr = info.data;
#endif

    if (!needConvert && !needScale)
    {
        decodedImage = vxCreateImageFromHandle(vxContext, vx_type_map[depth], &decodedImageAddr,
                                               &decodedPtr, VX_IMPORT_TYPE_HOST);
        NVXIO_CHECK_REFERENCE( decodedImage );
        NVXIO_SAFE_CALL( nvxuCopyImage(vxContext, decodedImage, image) );
    }
    else
    {
        // 1. upload the decoded image to the CUDA buffer
        NVXIO_ASSERT( cudaSuccess == cudaMemcpy2D(devMem, devMemPitch,
                                                  decodedPtr, decodedImageAddr.stride_y,
                                                  decodedImageAddr.dim_x * depth, decodedImageAddr.dim_y,
                                                  cudaMemcpyHostToDevice) );

        // 2. create vx_image wrapper for decoded buffer
        decodedImageAddr.stride_y = static_cast<vx_int32>(devMemPitch);
        decodedImage = vxCreateImageFromHandle(vxContext, vx_type_map[depth], &decodedImageAddr,
                                               &devMem, NVX_IMPORT_TYPE_CUDA);
        NVXIO_CHECK_REFERENCE( decodedImage );

        if (needScale)
        {
            // 3. scale image
            NVXIO_SAFE_CALL( vxuScaleImage(vxContext, decodedImage, scaledImage, VX_INTERPOLATION_TYPE_BILINEAR) );

            // 4. convert to dst image
            NVXIO_SAFE_CALL( vxuColorConvert(vxContext, scaledImage, image) );
        }
        else
        {
            // 3. convert to dst image
            NVXIO_SAFE_CALL( vxuColorConvert(vxContext, decodedImage, image) );
        }
    }

#if GST_VERSION_MAJOR != 0
    gst_buffer_unmap(buffer, &info);
#endif

    NVXIO_SAFE_CALL( vxReleaseImage(&decodedImage) );

    return FrameSource::OK;
}
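A minimal standalone sketch of the pitched upload used in the else-branch above, with hypothetical names and error handling reduced to a boolean result; it assumes a tightly packed 3-byte-per-pixel host image, as the devMem allocation above does:

#include <cuda_runtime.h>

// Copy a packed width x height RGB host image into freshly pitched device memory.
bool uploadPackedRGB(const unsigned char* host, int width, int height,
                     void*& devMem, size_t& devMemPitch)
{
    // rows are width * 3 bytes; the returned pitch may be rounded up for alignment
    if (cudaMallocPitch(&devMem, &devMemPitch, width * 3, height) != cudaSuccess)
        return false;

    // source pitch is the packed row size, destination pitch is the returned one
    return cudaMemcpy2D(devMem, devMemPitch,
                        host, width * 3,
                        width * 3, height,
                        cudaMemcpyHostToDevice) == cudaSuccess;
}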
Exemple #24
0
void InitCudaLayers()
{

	mmGridSizeX = sim_width/blockSizex;
	mmGridSizeY = sim_height/blockSizey;
	mmGridSize = mmGridSizeX*mmGridSizeY;
	memset(mmGrid, 0, sizeof(mmGrid));
	memset(mmYGGrid, 0, sizeof(mmYGGrid));

	tempHostData = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float));
	tempHostDataNoCuda = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float));
	grid8ValTick = (float*)malloc(sim_width*sim_height*8*sizeof(float));

	initColors();

	memset(gCudaLayer, 0, sizeof(gCudaLayer));
	memset(gCudaFuncLayer, 0, sizeof(gCudaFuncLayer));
	memset(gPhysLayer, 0, sizeof(gPhysLayer));
	memset(gStateLayer, 0, sizeof(gStateLayer));

	srand(0);
	int seed = rand();

	const cudaChannelFormatDesc desc4 = cudaCreateChannelDesc<float4>();
	cudaMallocArray(&gCudaVectArray, &desc4, sim_width, sim_height);
	// channel descriptor matching NFLAYERS components per texel
#if NFLAYERS == 2
	const cudaChannelFormatDesc descF = cudaCreateChannelDesc<float2>();
#elif NFLAYERS == 4
	const cudaChannelFormatDesc descF = desc4;
#endif
	cudaMallocArray(&gCudaFlArray, &descF, sim_width, sim_height);

	const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
	cudaMallocArray(&gCudaFuncWavePack, &desc, sim_width);
	cudaMallocArray(&gCudaFuncSmooth, &desc, sim_width);

	cudaMallocArray(&(gCudaLayer[0]), &desc, sim_width, sim_height);
	cudaMallocArray(&(gCudaLayer[1]), &desc, sim_width, sim_height);

	cudaMallocArray(&(gCudaFuncLayer[0]), &desc, sim_width, sim_height);

	cudaMalloc(&cuTempData, TEMP_SIZE*sizeof(float)*sim_width*sim_height);
	cudaMalloc(&cuRandArr, sizeof(unsigned int)*sim_width*sim_height);

	cudaMalloc(&gStateLayer[0], sim_rect*sizeof(float));
	cudaMemset(gStateLayer[0], 0, sim_rect*sizeof(float));
	cudaMalloc(&gStateLayer[1], sim_rect*sizeof(float));
	cudaMemset(gStateLayer[1], 0, sim_rect*sizeof(float));

	cudaMalloc(&gPhysLayer[0], sim_rect*sizeof(float));
	cudaMemset(gPhysLayer[0], 0, sim_rect*sizeof(float));
	cudaMalloc(&gPhysLayer[1], sim_rect*sizeof(float));
	cudaMemset(gPhysLayer[1], 0, sim_rect*sizeof(float));

	cudaMalloc(&gRedBlueField, NFLAYERS*sim_rect*sizeof(float));
	cudaMemset(gRedBlueField, 0, NFLAYERS*sim_rect*sizeof(float));

	size_t pitch = 0;
	cudaMallocPitch((void**)&gVectorLayer, &pitch, 4*sim_width*sizeof(float), sim_height);

	// clear using the pitch returned by cudaMallocPitch, which may exceed the row width
	cudaMemset2D(gVectorLayer, pitch, 0, 4*sim_width*sizeof(float), sim_height);

	InitWavePack(32, 1.f, sim_width, sim_height, cuTempData, gCudaFuncWavePack);

	InitSmooth(1, sim_width, cuTempData, gCudaFuncSmooth);

	InitRnd2DInt(seed, cuRandArr, sim_width, sim_height);

	InitFuncLayer(gCudaFuncLayer[0], cuTempData, sim_width, sim_height);

	InitPhysLayer(gPhysLayer[0], gStateLayer[0], cuRandArr, sim_width, sim_height);

	float* gridIni = cuTempData+3*sim_rect/2;
	float* halfTemp = cuTempData + sim_rect;
	float* out = cuTempData + 2*sim_rect;
	cudaMemset(out, 0, sim_rect*sizeof(float));

	seed = rand();
	int gridx = INTERP_SIZEX;
	int gridy = INTERP_SIZEX;
	InitRnd2DF(seed, gridIni, gridx, gridy);
	float scaleadd = .7f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

	seed = rand();
	gridx = (int)(gridx*2);
	gridy = (int)(gridy*2);
	InitRnd2DF(seed, gridIni, gridx, gridy);
	scaleadd = .3f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);


	cudaMemcpyToArray(gCudaLayer[0], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice);

	cudaMemset(out, 0, sim_rect*sizeof(float));
	gridx = INTERP_SIZEX;
	gridy = INTERP_SIZEX;

	seed = rand();
	InitRnd2DF(seed, gridIni, gridx, gridy);
	scaleadd = .7f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

	seed = rand();
	gridx = (int)(gridx*1.5);
	gridy = (int)(gridy*1.5);
	InitRnd2DF(seed, gridIni, gridx, gridy);
	scaleadd = .3f;
	Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height);

	cudaMemcpyToArray(gCudaLayer[1], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice);

	float2 pos0;
	pos0.x = gObj0X;
	pos0.y = gObj0Y;

	float2 pos1;
	pos1.x = gObj1X;
	pos1.y = gObj1Y;

	gObjInertia.Init(pos0, pos1);

	LayerProc(sim_width, sim_height, gCudaLayer[0], gCudaFuncLayer[0], cuTempData, pos0.x , pos0.y, pos1.x , pos1.y);
	ParticleStateInit(cuTempData, cuRandArr, 
					   gStateLayer[0], gPhysLayer[0], gRedBlueField);

	InitBhv();

}
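A reduced sketch of the gVectorLayer allocation pattern above (names hypothetical): whatever pitch cudaMallocPitch reports is the value cudaMemset2D and all later row arithmetic must use, not the logical row width:

#include <cuda_runtime.h>

// Allocate and zero a width x height field of float4 (4 floats per cell).
float4* allocVectorField(int width, int height, size_t& pitch)
{
    float4* field = nullptr;
    const size_t rowBytes = width * sizeof(float4);

    if (cudaMallocPitch((void**)&field, &pitch, rowBytes, height) != cudaSuccess)
        return nullptr;

    // pitch >= rowBytes; pass the real pitch, not rowBytes, when clearing
    if (cudaMemset2D(field, pitch, 0, rowBytes, height) != cudaSuccess)
    {
        cudaFree(field);
        return nullptr;
    }
    return field;
}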
Exemple #25
0
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    devID = findCudaGLDevice(argc, (const char **)argv);

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");

    if (ref_file)
    {
        autoTest(argv);
        cleanup();
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
        atexit(cleanup);
        glutMainLoop();
    }

    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
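Since dvfield above is allocated with cudaMallocPitch, kernels cannot index it as a flat DIM x DIM array; each row starts tPitch bytes after the previous one. A sketch of that addressing pattern (the kernel and struct here are illustrative, not part of the sample):

#include <cuda_runtime.h>

struct cData { float x, y; };   // stands in for the sample's two-component element

// Zero one element per thread in a pitched 2D field.
__global__ void clearField(cData* field, size_t pitchBytes, int dimX, int dimY)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dimX || y >= dimY) return;

    // advance y rows in bytes, then index within the row as cData
    cData* row = (cData*)((char*)field + (size_t)y * pitchBytes);
    row[x].x = 0.0f;
    row[x].y = 0.0f;
}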
Exemple #26
0
void convertFrame(vx_context vxContext,
                  vx_image frame,
                  const FrameSource::Parameters & configuration,
                  vx_imagepatch_addressing_t & decodedImageAddr,
                  void * decodedPtr,
                  bool is_cuda,
                  void *& devMem,
                  size_t & devMemPitch,
                  vx_image & scaledImage
                  )
{
    vx_df_image_e vx_type_map[5] = { VX_DF_IMAGE_VIRT, VX_DF_IMAGE_U8,
                                     VX_DF_IMAGE_VIRT, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX };
    vx_df_image_e decodedFormat = vx_type_map[decodedImageAddr.stride_x];

    // fetch image width, height, and format
    vx_uint32 frameWidth, frameHeight;
    vx_df_image_e frameFormat;
    NVXIO_SAFE_CALL( vxQueryImage(frame, VX_IMAGE_ATTRIBUTE_WIDTH, (void *)&frameWidth, sizeof(frameWidth)) );
    NVXIO_SAFE_CALL( vxQueryImage(frame, VX_IMAGE_ATTRIBUTE_HEIGHT, (void *)&frameHeight, sizeof(frameHeight)) );
    NVXIO_SAFE_CALL( vxQueryImage(frame, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&frameFormat, sizeof(frameFormat)) );
    bool needScale = frameWidth != decodedImageAddr.dim_x ||
                     frameHeight != decodedImageAddr.dim_y;
    bool needConvert = frameFormat != decodedFormat;

    // the configured and actual image sizes must be the same!
    if ((frameWidth != configuration.frameWidth) ||
            (frameHeight != configuration.frameHeight))
    {
        NVXIO_THROW_EXCEPTION("Actual image [ " << frameWidth << " x " << frameHeight <<
                              " ] is not equal to configuration one [ " << configuration.frameWidth
                              << " x " << configuration.frameHeight << " ]");
    }

    // allocate CUDA memory to copy decoded image to
    if (!is_cuda)
    {
        if (!devMem)
        {
            // we assume that the decoded image has no more than 4 channels per pixel
            NVXIO_ASSERT( cudaSuccess == cudaMallocPitch(&devMem, &devMemPitch, decodedImageAddr.dim_x * 4,
                                                         decodedImageAddr.dim_y) );
        }
    }

    // check if decoded image format has changed
    if (scaledImage)
    {
        vx_df_image_e scaledFormat;
        NVXIO_SAFE_CALL( vxQueryImage(scaledImage, VX_IMAGE_ATTRIBUTE_FORMAT, (void *)&scaledFormat, sizeof(scaledFormat)) );

        if (scaledFormat != decodedFormat)
        {
            NVXIO_SAFE_CALL( vxReleaseImage(&scaledImage) );
            scaledImage = NULL;
        }
    }

    if (needScale && !scaledImage)
    {
        scaledImage = vxCreateImage(vxContext, frameWidth, frameHeight, decodedFormat);
        NVXIO_CHECK_REFERENCE( scaledImage );
    }

    vx_image decodedImage = NULL;

    // 1. create vx_image wrapper
    if (is_cuda)
    {
        // a. create vx_image wrapper from CUDA pointer
        decodedImage = vxCreateImageFromHandle(vxContext, decodedFormat, &decodedImageAddr,
                                               &decodedPtr, NVX_IMPORT_TYPE_CUDA);
    }
    else
    {
        // a. upload decoded image to CUDA buffer
        NVXIO_ASSERT( cudaSuccess == cudaMemcpy2D(devMem, devMemPitch,
                                                  decodedPtr, decodedImageAddr.stride_y,
                                                  decodedImageAddr.dim_x * decodedImageAddr.stride_x,
                                                  decodedImageAddr.dim_y, cudaMemcpyHostToDevice) );

        // b. create vx_image wrapper for decoded buffer
        decodedImageAddr.stride_y = static_cast<vx_int32>(devMemPitch);
        decodedImage = vxCreateImageFromHandle(vxContext, decodedFormat, &decodedImageAddr,
                                               &devMem, NVX_IMPORT_TYPE_CUDA);
    }
    NVXIO_CHECK_REFERENCE( decodedImage );

    // 2. scale if necessary
    if (needScale)
    {
        // a. scale image
        NVXIO_SAFE_CALL( vxuScaleImage(vxContext, decodedImage, scaledImage, VX_INTERPOLATION_TYPE_BILINEAR) );
    }
    else
    {
        scaledImage = decodedImage;
    }

    // 3. convert / copy to dst image
    if (needConvert)
    {
        NVXIO_SAFE_CALL( vxuColorConvert(vxContext, scaledImage, frame) );
    }
    else
    {
        NVXIO_SAFE_CALL( nvxuCopyImage(vxContext, scaledImage, frame) );
    }

    if (!needScale)
        scaledImage = NULL;

    NVXIO_SAFE_CALL( vxReleaseImage(&decodedImage) );
}
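For completeness, the reverse of the upload in step a above: a sketch (hypothetical names, packed RGB assumed) that copies a pitched device image back into a tightly packed host buffer by swapping the pitch arguments of cudaMemcpy2D:

#include <cuda_runtime.h>
#include <vector>

// Copy a pitched device RGB image into a packed host vector.
bool downloadPackedRGB(const void* devMem, size_t devMemPitch,
                       int width, int height, std::vector<unsigned char>& host)
{
    host.resize((size_t)width * height * 3);

    // destination pitch is the packed row size, source pitch the device pitch
    return cudaMemcpy2D(host.data(), width * 3,
                        devMem, devMemPitch,
                        width * 3, height,
                        cudaMemcpyDeviceToHost) == cudaSuccess;
}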