Example #1
magma_int_t magma_malloc_pinned( void** ptrPtr, size_t size )
{
    // CUDA can't allocate 0 bytes, so allocate some minimal size
    // (for pinned memory, the error is detected in free)
    if ( size == 0 )
        size = sizeof(magmaDoubleComplex);
    if ( cudaSuccess != cudaMallocHost( ptrPtr, size )) {
        return MAGMA_ERR_HOST_ALLOC;
    }
    return MAGMA_SUCCESS;
}
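Since the comment notes that a failed zero-byte pinned allocation is only detected at free time, the matching free wrapper has to check the return value of cudaFreeHost. A minimal sketch of such a counterpart; the error-code name MAGMA_ERR_INVALID_PTR is an assumption, not taken from the example above:

// Hedged sketch: free counterpart to magma_malloc_pinned.
// MAGMA_ERR_INVALID_PTR is assumed; the actual MAGMA error code may differ.
magma_int_t magma_free_pinned( void* ptr )
{
    if ( cudaSuccess != cudaFreeHost( ptr )) {
        return MAGMA_ERR_INVALID_PTR;
    }
    return MAGMA_SUCCESS;
}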
Example #2
static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) {
  if (size < 0) THError("Invalid memory size: %ld", size);

  THCudaHostAllocator_free(ctx, ptr);

  if (size == 0) return NULL;

  THCudaCheck(cudaMallocHost(&ptr, size));

  return ptr;
}
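Note that this realloc deliberately discards the old contents: the previous block is freed before the new one is allocated. A content-preserving variant would need the old size, which this allocator interface does not pass in; a hedged sketch with a hypothetical oldSize parameter:

// Hedged sketch: content-preserving pinned realloc (not part of THC).
// The oldSize parameter is hypothetical; the THC allocator does not track it.
#include <string.h>
static void *pinned_realloc_keep(void *ptr, size_t oldSize, size_t newSize) {
  void *newPtr = NULL;
  if (newSize > 0) THCudaCheck(cudaMallocHost(&newPtr, newSize));
  if (ptr && newPtr)
    memcpy(newPtr, ptr, oldSize < newSize ? oldSize : newSize);
  if (ptr) THCudaCheck(cudaFreeHost(ptr));
  return newPtr;
}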
Example #3
float* TestClass::initData(int m, int n)
{
    float* arr;
    checkCudaErrors(cudaMallocHost(&arr, m*n*sizeof(float), cudaHostAllocDefault));
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
        {
            arr[j*m + i] = i + j + rand()%100/(float)3;
        }
    return arr;
}
Example #4
void Segmentation_SLIC::initMemory_SLIC(){
	SLIC_input_host = new rgb[width * height];
	cudaMalloc(&SLIC_input_device, sizeof(rgb) * width * height);		//pixel map
	cudaMalloc(&SLIC_cc, sizeof(rgbxy) * Cluster_Num);							//cluster centers
	cudaMalloc(&SLIC_aggregation, sizeof(aggregationUnit) * Cluster_Num);			//cluster centers
	cudaMallocHost(&agg_host, sizeof(aggregationUnit)*Cluster_Num);
	cudaMalloc(&grd, sizeof(float) * width * height);					//gradients
	cudaMalloc(&ld, sizeof(label_distance) * width * height);			//ld pixel map
	cudaMalloc(&result_GPU, width * height * sizeof(int));
	resultMap = new int[width * height];
}
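None of these allocations are checked or released in this snippet; a hedged sketch of a matching cleanup routine (the name freeMemory_SLIC is assumed), pairing each allocation with the appropriate release call:

// Hedged sketch: cleanup counterpart to initMemory_SLIC (name assumed).
void Segmentation_SLIC::freeMemory_SLIC(){
	delete[] SLIC_input_host;
	cudaFree(SLIC_input_device);
	cudaFree(SLIC_cc);
	cudaFree(SLIC_aggregation);
	cudaFreeHost(agg_host);		// pinned host memory -> cudaFreeHost
	cudaFree(grd);
	cudaFree(ld);
	cudaFree(result_GPU);
	delete[] resultMap;
}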
Example #5
int main(int argc, char ** argv)
{
    size_t grid_size = WIDTH * HEIGHT * sizeof(float);

    // allocate memory
    float * current;
    float * next;
    float * result;
    CHECK_CUDA_CALL(cudaMalloc(&current, grid_size));
    CHECK_CUDA_CALL(cudaMalloc(&next, grid_size));
    CHECK_CUDA_CALL(cudaMallocHost(&result, grid_size));

    // initialize with the heat source
    CHECK_CUDA_CALL(cudaMemset(current, 0, grid_size));
    CHECK_CUDA_CALL(cudaMemcpy(&current[idx(HEAT_X, HEAT_Y)], &HEAT_TEMP, sizeof(float), cudaMemcpyHostToDevice));
    CHECK_CUDA_CALL(cudaMemcpy(next, current, grid_size, cudaMemcpyDeviceToDevice));

    // run the update steps
    for (unsigned int step = 0; step < STEPS; ++step) {
        update_cuda(WIDTH, 1, WIDTH-1, 1, HEIGHT-1, HEAT_X, HEAT_Y, current, next);

        float * swap = current;
        current = next;
        next = swap;
    }
    CHECK_CUDA_CALL(cudaGetLastError());
    CHECK_CUDA_CALL(cudaDeviceSynchronize());

    // copy the result back to the host for plotting
    CHECK_CUDA_CALL(cudaMemcpy(result, current, grid_size, cudaMemcpyDeviceToHost));

    // graphics
    sdls_init(WIDTH, HEIGHT);
    rgba * gfx = (rgba *) calloc(WIDTH * HEIGHT, sizeof(rgba));
    for (unsigned int y = 0; y < HEIGHT; ++y) {
        for (unsigned int x = 0; x < WIDTH; ++x) {
            gfx[idx(x, y)] = color1(result[idx(x, y)] / HEAT_TEMP);
        }
    }

    sdls_blitrectangle_rgba(0, 0, WIDTH, HEIGHT, gfx);
    sdls_draw();

    printf("Presione ENTER para salir\n");
    getchar();
    sdls_cleanup();

    CHECK_CUDA_CALL(cudaFree(current));
    CHECK_CUDA_CALL(cudaFree(next));
    CHECK_CUDA_CALL(cudaFreeHost(result));
    free(gfx);

    return 0;
}
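The idx helper used above is not shown. A common row-major definition, assumed here only for illustration:

// Hedged sketch: row-major index helper assumed by the example above.
#define idx(x, y) ((y) * WIDTH + (x))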
Example #6
// If CUDA is available and in GPU mode, host memory will be allocated pinned,
// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
// The improvement in performance seems negligible in the single GPU case,
// but might be more significant for parallel training. Most importantly,
// it improved stability for large models on many GPUs.
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaMallocHost(ptr, size));
    *use_cuda = true;
    return;
  }
#endif
  *ptr = malloc(size);
  *use_cuda = false;
  CHECK(*ptr) << "host allocation of size " << size << " failed";
}
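The pointer must later be released along the same path that allocated it, which is why the use_cuda flag is returned to the caller. A sketch of the matching deallocation, following the same pattern (the exact signature in Caffe may differ):

// Hedged sketch: deallocation matching CaffeMallocHost's use_cuda flag.
inline void CaffeFreeHost(void* ptr, bool use_cuda) {
#ifndef CPU_ONLY
  if (use_cuda) {
    CUDA_CHECK(cudaFreeHost(ptr));
    return;
  }
#endif
  free(ptr);
}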
Example #7
int main(int argc, char **argv)
{
	// define the ptr
	int size = WIDTH * HEIGHT;
	float *h_data, *d_send_data, *d_recv_data; 
	bool use_cuda_time = true;

	if(argc != 3) {
		std::cout << "the number of paramter should be equal 2" << std::endl;
		std::cout << "egs: bandwidth_test_between2gpu 0 1" << std::endl;
		return 1;
	}
	//std::cout << "debug 1" << std::endl;
	int id0 = atoi(argv[1]);
	int id1 = atoi(argv[2]);
	std::cout << "id0=" << id0 << ", id1=" << id1 << std::endl;

	//h_data = new float[size];
	cudaMallocHost(&h_data, size*sizeof(float));
	init_data(h_data, size);

	cudaSetDevice(id0);
	cudaMalloc(&d_send_data, size*sizeof(float));
	cudaSetDevice(id1);
	cudaMalloc(&d_recv_data, size*sizeof(float));
	cudaMemcpy(d_send_data, h_data, size*sizeof(float), cudaMemcpyHostToDevice);

	int can_access_peer_0_1, can_access_peer_1_0;
	cudaSetDevice(id0);
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_0_1, id0, id1));
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_1_0, id1, id0));

	if(can_access_peer_0_1 && can_access_peer_1_0) {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": Yes" << std::endl;
		cudaSetDevice(id0);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id1, 0));
		cudaSetDevice(id1);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id0, 0));
	} else {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": No" << std::endl;
	}

	cudaSetDevice(id1);
	use_cuda_time = false;
	//use_cuda_time = true;
	test_2gpu(d_send_data, d_recv_data, size, id0, id1, use_cuda_time);

	cudaFreeHost(h_data);
	cudaFree(d_send_data);
	cudaFree(d_recv_data);

	return 0;
}
Example #8
PlaneFitting_FVI::PlaneFitting_FVI():
	Plane_Projector(0),
	Convert2D3D(0),
	Color_Device(cv::gpu::createContinuous(Kinect::Height, Kinect::Width, CV_8UC3))
	{
	Convert2D3D = new DimensionConvertor();
	NASP = new NormalAdaptiveSuperpixel(Kinect::Width, Kinect::Height);
	Buffer = new Buffer2D(Kinect::Width, Kinect::Height);
	JBF = new JointBilateralFilter(Kinect::Width, Kinect::Height);
	spMerger = new LabelEquivalenceSeg(Kinect::Width, Kinect::Height);
	normalEstimator = new NormalMapGenerator(Kinect::Width, Kinect::Height);
	cudaMallocHost(&Depth_Host, Kinect::Width * Kinect::Height * sizeof(float));
	cudaMalloc(&Depth_Device, Kinect::Width * Kinect::Height * sizeof(float));
	cudaMallocHost(&Points3D_Host, Kinect::Width * Kinect::Height * sizeof(float3));
	cudaMalloc(&Points3D_Device, Kinect::Width*Kinect::Height * sizeof(float3));
	Single_Kinect = new SingleKinect();
	Plane_Projector = new Projection_GPU(Kinect::Width, Kinect::Height, Single_Kinect->GetIntrinsicMatrix());
	Convert2D3D->setCameraParameters(Single_Kinect->GetIntrinsicMatrix(), Kinect::Width, Kinect::Height);
	NASP->SetParametor(15, 20, Single_Kinect->GetIntrinsicMatrix());
	normalEstimator->setNormalEstimationMethods(normalEstimator->BILATERAL);
	}
Example #9
JointBilateralFilter::JointBilateralFilter(int width, int height):
	Width(width),
	Height(height),
	InputDepth(height, width),
	OutputDepth(height, width),
	smooth_Device(cv::gpu::createContinuous(height, width, CV_8UC3)),
	smooth_Host(height, width){
	SpatialFilter_Host = new float[WindowSize*WindowSize];
	cudaMalloc(&SpatialFilter_Device, sizeof(float)*WindowSize*WindowSize);
	cudaMallocHost(&Filtered_Host, sizeof(float)*Width*Height);
	cudaMalloc(&Filtered_Device, sizeof(float)*Width*Height);
	calcSpatialFilter();
	}
Example #10
    T* pinnedAlloc(const size_t &elements)
    {
        managerInit();
        T* ptr = NULL;
        // Round the allocation up to a whole megabyte. The overhead of
        // creating pinned memory is higher, so we want more reusable blocks.
        size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
                pinnedGarbageCollect();
            }

            for(mem_iter iter = pinned_maps.begin();
                iter != pinned_maps.end(); ++iter) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    pinned_used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
                pinnedGarbageCollect();
                CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, false, alloc_bytes};
            pinned_maps[ptr] = info;
            pinned_used_bytes += alloc_bytes;
        }
        return (T*)ptr;
    }
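The rounding to whole megabytes relies on a divup helper that is not shown; a minimal sketch of the usual definition, assumed here:

// Hedged sketch: integer ceiling division, so divup(n, 1048576) * 1048576
// rounds n up to the next 1 MiB boundary.
static inline size_t divup(size_t n, size_t d) { return (n + d - 1) / d; }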
Example #11
/** Documented at declaration */
void
gpujpeg_image_convert(const char* input, const char* output, struct gpujpeg_image_parameters param_image_from,
                      struct gpujpeg_image_parameters param_image_to)
{
    assert(param_image_from.width == param_image_to.width);
    assert(param_image_from.height == param_image_to.height);
    assert(param_image_from.comp_count == param_image_to.comp_count);

    // Load image
    int image_size = gpujpeg_image_calculate_size(&param_image_from);
    uint8_t* image = NULL;
    if ( gpujpeg_image_load_from_file(input, &image, &image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to load image [%s]!\n", input);
        return;
    }

    struct gpujpeg_coder coder;
    gpujpeg_set_default_parameters(&coder.param);
    coder.param.color_space_internal = GPUJPEG_RGB;

    // Initialize coder and preprocessor
    coder.param_image = param_image_from;
    assert(gpujpeg_coder_init(&coder) == 0);
    assert(gpujpeg_preprocessor_encoder_init(&coder) == 0);
    // Perform preprocessor
    assert(cudaMemcpy(coder.d_data_raw, image, coder.data_raw_size * sizeof(uint8_t), cudaMemcpyHostToDevice) == cudaSuccess);
    assert(gpujpeg_preprocessor_encode(&coder) == 0);
    // Save preprocessor result
    uint8_t* buffer = NULL;
    assert(cudaMallocHost((void**)&buffer, coder.data_size * sizeof(uint8_t)) == cudaSuccess);
    assert(buffer != NULL);
    assert(cudaMemcpy(buffer, coder.d_data, coder.data_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) == cudaSuccess);
    // Deinitialize decoder
    gpujpeg_coder_deinit(&coder);

    // Initialize coder and postprocessor
    coder.param_image = param_image_to;
    assert(gpujpeg_coder_init(&coder) == 0);
    assert(gpujpeg_preprocessor_decoder_init(&coder) == 0);
    // Perform postprocessor
    assert(cudaMemcpy(coder.d_data, buffer, coder.data_size * sizeof(uint8_t), cudaMemcpyHostToDevice) == cudaSuccess);
    assert(gpujpeg_preprocessor_decode(&coder) == 0);
    // Save preprocessor result
    assert(cudaMemcpy(coder.data_raw, coder.d_data_raw, coder.data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost) == cudaSuccess);
    if ( gpujpeg_image_save_to_file(output, coder.data_raw, coder.data_raw_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to save image [%s]!\n", output);
        return;
    }
    // Deinitialize decoder
    gpujpeg_coder_deinit(&coder);
}
Example #12
 void allocateColorField(int volume, QudaPrecision prec, bool usePinnedMemory, void*& field)
 {
   const int realSize = getRealSize(prec);
   int siteSize = 18;
   if(usePinnedMemory){
     cudaMallocHost((void**)&field, volume*siteSize*realSize);
   }else{
     field = (void*)malloc(volume*siteSize*realSize);
   }
   if(field == NULL){
     errorQuda("ERROR: allocateColorField failed\n");
   }
   return;
 }
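getRealSize is not shown in this snippet; a hedged sketch of a plausible definition based on the QudaPrecision argument (the enum value name is an assumption):

// Hedged sketch: bytes per real value for a given precision.
// QUDA_DOUBLE_PRECISION is an assumed enum value name.
static int getRealSize(QudaPrecision prec)
{
  return (prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
}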
Example #13
int magma_solve ( int *dA_dim, int *lWork, double2 *A, int *ipiv, int *N ){

	// Check inputs
	//
	fprintf (stderr, "Using MAGMA solve\n" );
	fprintf (stderr, "	dA_dim: %i\n", *dA_dim );
	fprintf (stderr, "	N: %i\n", *N );
	fprintf (stderr, "	lWork: %i\n", *lWork );

	cuInit(0);
	cublasInit();
	printout_devices();

	cublasStatus status;

	double2 *d_A, *work;
	status = cublasAlloc ( *dA_dim, sizeof(double2), (void**)&d_A );

	if ( status != CUBLAS_STATUS_SUCCESS ){
			fprintf (stderr, "ERROR: device memory allocation error (d_A)\n" );
			fprintf (stderr, "ERROR: dA_dim: %i\n", dA_dim );
	}

	cudaError_t err;
	err = cudaMallocHost ( (void**)&work, *lWork * sizeof(double2) );

	if(err != cudaSuccess){
		fprintf (stderr, "ERROR: cudaMallocHost error (work)\n" );
	}

	int info[1];
	TimeStruct start, end;

	start = get_current_time ();
	magma_zgetrf ( N, N, A, N, ipiv, work, d_A, info );
	end = get_current_time ();

	double gpu_perf;
	gpu_perf = 4.*2.*(*N)*(*N)*(*N)/(3.*1000000*GetTimerValue(start,end));

	if ( info[0] != 0 ){
			fprintf (stderr, "ERROR: magma_zgetrf failed\n" );
	}

	printf("	GPU performance: %6.2f GFlop/s\n", gpu_perf);

	int stat = 0;
	return stat;

}
Example #14
    static void 
      fetchOprodFromGPUArraysQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod, size_t bytes, int Vh)
      {
        float2 *packedEven, *packedOdd;
        if(cudaMallocHost(&packedEven,bytes) == cudaErrorMemoryAllocation) {
	  errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
	}
        if (cudaMallocHost(&packedOdd, bytes) == cudaErrorMemoryAllocation) {
	  errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
	}


        cudaMemcpy(packedEven, cudaOprodEven, bytes, cudaMemcpyDeviceToHost);
        checkCudaError();
        cudaMemcpy(packedOdd, cudaOprodOdd, bytes, cudaMemcpyDeviceToHost);
        checkCudaError();

        unpackOprodField((float*)cpuOprod, packedEven, 0, Vh);
        unpackOprodField((float*)cpuOprod, packedOdd,  1, Vh);

        cudaFreeHost(packedEven);
        cudaFreeHost(packedOdd);
      }
Example #15
static void *nv_malloc(unsigned long n)
{
	void *mem;
#if (NV_ENABLE_CUDA && NV_GPU_PIN_MALLOC)
	if (nv_gpu_available()) {
		cudaMallocHost(&mem, n);
	} else {
		mem = malloc(n);
	}
#else
	mem = malloc(n);
#endif
	return mem;
}
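A buffer obtained from nv_malloc must be released along the same path; a hedged sketch of the counterpart (the name nv_free is assumed, as is the assumption that nv_gpu_available() answers the same as at allocation time):

// Hedged sketch: counterpart to nv_malloc (name assumed).
// Assumes nv_gpu_available() returns the same answer as at allocation time.
static void nv_free(void *mem)
{
#if (NV_ENABLE_CUDA && NV_GPU_PIN_MALLOC)
	if (nv_gpu_available()) {
		cudaFreeHost(mem);
		return;
	}
#endif
	free(mem);
}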
Example #16
void load_bodies()
{
	fscanf(file,"%f %d %d\n",&TIME, &XCOE, &VCOE);
	fscanf(file,"%d\n",&N);
	
	cudaMallocHost((void **)&X,  N * sizeof(float4));
	cudaMallocHost((void **)&V,  N * sizeof(float4));
	
	for(int i=0;i<N;i++) {
		fscanf(file, "%f %f",&X[i].w, &V[i].w);
		fscanf(file, "%f %f %f",&X[i].x, &X[i].y, &X[i].z, &X[i].w);
		fscanf(file, "%f %f %f",&V[i].x, &V[i].y, &V[i].z);
	}
	
	for(int i=0;i<N;i++) {
		X[i].x *= XCOE;
		X[i].y *= XCOE;
		X[i].z *= XCOE;
		V[i].x *= VCOE;
		V[i].y *= VCOE;
		V[i].z *= VCOE;
	}
}
Example #17
/** Documented at declaration */
struct gpujpeg_opengl_texture*
gpujpeg_opengl_texture_register(int texture_id, enum gpujpeg_opengl_texture_type texture_type)
{
    struct gpujpeg_opengl_texture* texture = NULL;
    cudaMallocHost((void**)&texture, sizeof(struct gpujpeg_opengl_texture));
    assert(texture != NULL);

    texture->texture_id = texture_id;
    texture->texture_type = texture_type;
    texture->texture_width = 0;
    texture->texture_height = 0;
    texture->texture_pbo_id = 0;
    texture->texture_pbo_type = 0;
    texture->texture_pbo_resource = 0;
    texture->texture_callback_param = NULL;
    texture->texture_callback_attach_opengl = NULL;
    texture->texture_callback_detach_opengl = NULL;

#ifdef GPUJPEG_USE_OPENGL
    glBindTexture(GL_TEXTURE_2D, texture->texture_id);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &texture->texture_width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &texture->texture_height);
    glBindTexture(GL_TEXTURE_2D, 0);
    assert(texture->texture_width != 0 && texture->texture_height != 0);

    // Select PBO type
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        texture->texture_pbo_type = GL_PIXEL_PACK_BUFFER;
    } else if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        texture->texture_pbo_type = GL_PIXEL_UNPACK_BUFFER;
    } else {
        assert(0);
    }

    // Create PBO
    glGenBuffers(1, &texture->texture_pbo_id);
    glBindBuffer(texture->texture_pbo_type, texture->texture_pbo_id);
    glBufferData(texture->texture_pbo_type, texture->texture_width * texture->texture_height * 3 * sizeof(uint8_t), NULL, GL_DYNAMIC_DRAW);
    glBindBuffer(texture->texture_pbo_type, 0);

    // Create CUDA PBO Resource
    cudaGraphicsGLRegisterBuffer(&texture->texture_pbo_resource, texture->texture_pbo_id, cudaGraphicsMapFlagsNone);
    gpujpeg_cuda_check_error("Register OpenGL buffer");
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return texture;
}
Example #18
/**
 * \brief Creates and initializes the working data for the plan
 * \param [in] plan The data and memory location for the plan.
 * \return int Error flag value
 * \sa parseCUDAMEMPlan 
 * \sa makeCUDAMEMPlan
 * \sa execCUDAMEMPlan
 * \sa perfCUDAMEMPlan
 * \sa killCUDAMEMPlan
*/
int   initCUDAMEMPlan(void *plan) {
	size_t avail, total, arraybytes;
	int M,i;
	int ret = make_error(ALLOC,generic_err);
	double gputhreads;
	cudaError_t cudaStat;
	struct cudaDeviceProp prop;
	Plan *p;
	CUDAMEMdata *d = NULL;
	p = (Plan *)plan;
	if (p) {
		d = (CUDAMEMdata*)p->vptr;
		p->exec_count = 0;
		perftimer_init(&p->timers, NUM_TIMERS);
	}
	if(d) {
		CUDA_CALL( cudaSetDevice(d->device) );
		CUDA_CALL( cudaMemGetInfo(&avail, &total) );
		CUDA_CALL( cudaGetDeviceProperties(&prop, d->device) );
		if (d->nGpuThreads != 0) {	// use the user spec'd number of threads or default to warp*cores
			gputhreads = (double)(d->nGpuThreads);
		} else {
			gputhreads = d->nGpuThreads = prop.warpSize * prop.multiProcessorCount;
		}
		if (prop.major < 2) {	// check results on older devices
			d->resultCheck = 1;
		} else {
			d->resultCheck = 0;
		}
		// calculate M for 6 M*M arrays to fill 100%/75%/50% of GPU free memory 
		// M = (d->nGpuThreads) * (int)(sqrt(0.75*avail/(6.0*sizeof(double)*gputhreads*gputhreads)));
		// M = (d->nGpuThreads) * (int)(sqrt(0.50*avail/(6.0*sizeof(double)*gputhreads*gputhreads)));
		M = (d->nGpuThreads) * (int)(sqrt(1.00*avail/(6.0*sizeof(double)*gputhreads*gputhreads)));
		// assume one will fit in host memory
		d->M = M;
		arraybytes = (size_t)(0.99*avail);
		d->arraybytes = arraybytes;
                d->arrayelems = arraybytes / sizeof(int);
		// host array and device arrays
		CUDA_CALL( cudaMallocHost((void **)(&(d->hostarray)), arraybytes) );
		CUDA_CALL( cudaMalloc    ((void **)(&(d->devicearray)), arraybytes) );
		// initialize so that results are M*PI**2/100
		//for(i=0; i<3*M*M; i++) d->HA[i] = (double)0.31415926535;
		//CUDA_CALL( cudaMemcpy( (d->DA), (d->HA), arraybytes, cudaMemcpyHostToDevice) );
		//CUDA_CALL( cudaMemcpy( (d->DB), (d->DA), arraybytes, cudaMemcpyDeviceToDevice) );
		ret = ERR_CLEAN;
	}
	return ret;
}
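For intuition, the M formula sizes six M-by-M double arrays to fit within the free device memory, with M rounded down to a multiple of nGpuThreads. A hedged numeric illustration with made-up values:

// Hedged illustration of the sizing formula (numbers are made up):
//   avail = 4 GiB, gputhreads = 1024
//   sqrt(avail / (6.0 * sizeof(double) * gputhreads * gputhreads))
//     = sqrt(4294967296 / 50331648) = sqrt(85.33) ~ 9.24  -> (int) 9
//   M = 1024 * 9 = 9216
//   six 9216 x 9216 double arrays ~ 3.80 GiB <= avail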
Example #19
CudaGridMap::CudaGridMap(const Vec3i &numGridPoints, const Vec3i &numGridPointsPadded, const double *inputEnergies, cudaStream_t stream)
    : stream(stream), numGridPoints(numGridPoints), numGridPointsPadded(numGridPointsPadded)
{
    // Allocate the padded grid in global memory
    CUDA_SAFE_CALL(cudaMalloc((void**)&energiesDevice, sizeof(float) * numGridPointsPadded.Cube()));

    // Convert doubles to floats and save them in page-locked memory
    int numGridPointsPerMap = numGridPoints.Cube();
    CUDA_SAFE_CALL(cudaMallocHost((void**)&energiesHost, sizeof(float) * numGridPointsPerMap));
    std::transform(inputEnergies, inputEnergies + numGridPointsPerMap, energiesHost, typecast<float, double>);

    // Copy the initial energies from the original grid to the padded one in global memory
    // Elements in the area of padding will stay uninitialized
    copyGridMapPadded(energiesDevice, numGridPointsPadded, energiesHost, numGridPoints, cudaMemcpyHostToDevice);
}
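The typecast helper passed to std::transform is not shown here (it also appears in the texture example further below); a hedged sketch of a plausible definition:

// Hedged sketch: conversion helper assumed by the std::transform calls.
template <typename TDst, typename TSrc>
static inline TDst typecast(TSrc value)
{
    return static_cast<TDst>(value);
}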
Example #20
cv::Mat gpuResize(cv::Mat& img, cv::Size newSize)
{
    #ifdef USE_CUDA
        // Upload source to GPU
        float* cpuPtr = &img.at<float>(0);
        float* gpuPtr;
        cudaMallocHost((void **)&gpuPtr, img.size().width * img.size().height * sizeof(float));
        cudaMemcpy(gpuPtr, cpuPtr, img.size().width * img.size().height * sizeof(float),
                   cudaMemcpyHostToDevice);

        // Upload destination to GPU
        cv::Mat newImg = cv::Mat(newSize,CV_32FC1,cv::Scalar(0));
        float* newCpuPtr = &newImg.at<float>(0);
        float* newGpuPtr;
        cudaMallocHost((void **)&newGpuPtr, newSize.width * newSize.height * sizeof(float));
        cudaMemcpy(newGpuPtr, newCpuPtr, newSize.width * newSize.height * sizeof(float),
                   cudaMemcpyHostToDevice);

        std::vector<const float*> sourcePtrs;
        sourcePtrs.emplace_back(gpuPtr);
        std::array<int, 4> targetSize = {1,1,newImg.size().height,newImg.size().width};
        std::array<int, 4> sourceSize = {1,1,img.size().height,img.size().width};
        std::vector<std::array<int, 4>> sourceSizes;
        sourceSizes.emplace_back(sourceSize);
        op::resizeAndMergeGpu(newGpuPtr, sourcePtrs, targetSize, sourceSizes);
        cudaMemcpy(newCpuPtr, newGpuPtr, newImg.size().width * newImg.size().height * sizeof(float),
                   cudaMemcpyDeviceToHost);

        cudaFreeHost(gpuPtr);      // allocated with cudaMallocHost
        cudaFreeHost(newGpuPtr);   // allocated with cudaMallocHost
        return newImg;
    #else
        op::error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run"
              " this functionality.", __LINE__, __FUNCTION__, __FILE__);
    #endif
}
Example #21
/** Documented at declaration */
void
gpujpeg_component_print16(struct gpujpeg_component* component, int16_t* d_data)
{
    int data_size = component->data_width * component->data_height;
    int16_t* data = NULL;
    cudaMallocHost((void**)&data, data_size * sizeof(int16_t));
    cudaMemcpy(data, d_data, data_size * sizeof(int16_t), cudaMemcpyDeviceToHost);

    printf("Print Data\n");
    for ( int y = 0; y < component->data_height; y++ ) {
        for ( int x = 0; x < component->data_width; x++ ) {
            printf("%3d ", data[y * component->data_width + x]);
        }
        printf("\n");
    }
    cudaFreeHost(data);
}
Example #22
TEST(Memset, MallocAfterMemset) {
    cudaError_t ret;
    void *ptr1, *ptr2;
    const size_t block = 1 << 10;

    ret = cudaMalloc(&ptr1, block);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemset(ptr1, 0, block);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMalloc(&ptr2, block);
    ASSERT_EQ(cudaSuccess, ret);

    // Download data
    void *hptr1;
    ret = cudaMallocHost(&hptr1, block);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpy(hptr1, ptr1, block, cudaMemcpyDeviceToHost);

    // Copy out validity bits
    uint8_t * vptr1 = new uint8_t[block];
    int valgrind = VALGRIND_GET_VBITS(hptr1, vptr1, block);
    assert(valgrind == 0 || valgrind == 1);

    // Check if Valgrind is running
    if (valgrind == 1) {
        uint8_t * eptr1 = new uint8_t[block];
        memset(eptr1, 0x0, block);

        EXPECT_EQ(0, memcmp(vptr1, eptr1, block));
        delete[] eptr1;
    }

    delete[] vptr1;

    ret = cudaFree(ptr2);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(ptr1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(hptr1);
    ASSERT_EQ(cudaSuccess, ret);
}
Example #23
CudaFloatTexture1D::CudaFloatTexture1D(int width, const double *data, CudaAction action, cudaStream_t stream, CudaInternalAPI *api)
{
    channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    // Allocate the texture on the GPU...
    CUDA_SAFE_CALL(cudaMallocArray(&deviceArray, &channelDesc, width, 1));
    // ... and in page-locked system memory
    CUDA_SAFE_CALL(cudaMallocHost((void**)&hostMem, sizeof(float) * width));

    // Convert doubles to floats and save them to page-locked system memory
    std::transform(data, data + width, hostMem, typecast<float, double>);

    // Copy floats from the page-locked memory to the GPU
    CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(deviceArray, 0, 0, hostMem, sizeof(float) * width, cudaMemcpyHostToDevice, stream));

    if (action == BindToKernel)
        api->setDistDepDielTexture(deviceArray, &channelDesc);
}
Example #24
  void
  host_image2d<V>::allocate(const domain_type& d, unsigned border, bool pinned)
  {
    V* ptr = 0;

#ifndef NO_CUDA
    if (pinned)
    {
      //cudaMallocHost(&ptr, domain_.nrows() * domain_.ncols() * sizeof(V));
      cudaMallocHost(&ptr, (domain_.nrows() + 2 * border) * (domain_.ncols() + 2 * border) * sizeof(V));
      pitch_ = domain_.ncols() * sizeof(V);
      data_ = boost::shared_ptr<V>(ptr, cudaFreeHost);
    }
    else
#endif
    {
      pitch_ = 0;
      pitch_ = (d.ncols() + 2 * border) * sizeof(V);
      if (pitch_ % 4)
       	pitch_ = pitch_ + 4 - (pitch_ & 3);
      ptr = (V*) new char[(d.nrows() + 2 * border) * pitch_ + 64];
      data_ = boost::shared_ptr<V>(ptr, array_free<V>);
      // data_ = boost::shared_ptr<V>(ptr, [&] (V* ptr) {
      // 	  for (unsigned r = 0; r < this->nrows(); r++)
      // 	    for (unsigned c = 0; c < this->ncols(); c++)
      // 	      this->operator()(r, c).~V();
      // 	  delete [] (char*)ptr;
      // 	});
      //data_ = boost::shared_ptr<V>(ptr, free_array_of_object<V>(*this));
      // assert(!(size_t(begin_) % 64));
      // assert(!(size_t(pitch_) % 64));
    }

    begin_ = data_.get() + (border * pitch_) / sizeof(V) + border;
    // if (size_t(begin_) % 64)
    //   begin_ = begin_ + 64 - (size_t(begin_) % 64);

    // for (unsigned r = 0; r < nrows(); r++)
    //   for (unsigned c = 0; c < ncols(); c++)
    // 	new(&this->operator()(r, c)) V();

    assert(begin_);
  }
Example #25
void reg_f3d_gpu<T>::AllocateWarped()
{
#ifndef NDEBUG
    printf("[NiftyReg DEBUG] reg_f3d_gpu<T>::AllocateWarped called.\n");
#endif
    if(this->currentReference==NULL){
        printf("[NiftyReg ERROR] Error when allocating the warped image.\n");
        exit(1);
    }
    this->ClearWarped();
    this->warped = nifti_copy_nim_info(this->currentReference);
    this->warped->dim[0]=this->warped->ndim=this->currentFloating->ndim;
    this->warped->dim[4]=this->warped->nt=this->currentFloating->nt;
    this->warped->pixdim[4]=this->warped->dt=1.0;
    this->warped->nvox = this->warped->nx *
                        this->warped->ny *
                        this->warped->nz *
                        this->warped->nt;
    this->warped->datatype = this->currentFloating->datatype;
    this->warped->nbyper = this->currentFloating->nbyper;
    NR_CUDA_SAFE_CALL(cudaMallocHost(&(this->warped->data), this->warped->nvox*this->warped->nbyper))
    if(this->warped->nt==1){
        if(cudaCommon_allocateArrayToDevice<float>(&this->warped_gpu, this->warped->dim)){
            printf("[NiftyReg ERROR] Error when allocating the warped image.\n");
            exit(1);
        }
    }
    else if(this->warped->nt==2){
        if(cudaCommon_allocateArrayToDevice<float>(&this->warped_gpu, &this->warped2_gpu, this->warped->dim)){
            printf("[NiftyReg ERROR] Error when allocating the warped image.\n");
            exit(1);
        }
    }
    else{
        printf("[NiftyReg ERROR] reg_f3d_gpu does not handle more than 2 time points in the floating image.\n");
        exit(1);
    }
#ifndef NDEBUG
    printf("[NiftyReg DEBUG] reg_f3d_gpu<T>::AllocateWarped done.\n");
#endif
    return;
}
Example #26
/**
 * @return maximal buffer size needed for an image with such properties
 */
int jpeg_to_dxt_decompress_reconfigure_real(void *state, struct video_desc desc,
        int rshift, int gshift, int bshift, int pitch, codec_t out_codec, int i)
{
    UNUSED(rshift);
    UNUSED(gshift);
    UNUSED(bshift);
    struct state_decompress_jpeg_to_dxt *s = (struct state_decompress_jpeg_to_dxt *) state;

    assert(out_codec == DXT1 || out_codec == DXT5);
    assert(pitch == (int) desc.width / s->ppb); // default for DXT1

    free(s->input[i]);
    free(s->output[i]);

    if(s->jpeg_decoder[i] != NULL) {
        gpujpeg_decoder_destroy(s->jpeg_decoder[i]);
    } else {
        gpujpeg_init_device(cuda_devices[i], 0);
    }

    if(s->dxt_out_buff[i] != NULL) {
        cudaFree(s->dxt_out_buff[i]);
    }

    if(cudaSuccess != cudaMallocHost((void **) &s->dxt_out_buff[i], desc.width * desc.height / s->ppb)) {
        fprintf(stderr, "Could not allocate CUDA output buffer.\n");
        return 0;
    }
    //gpujpeg_init_device(cuda_device, GPUJPEG_OPENGL_INTEROPERABILITY);

    s->jpeg_decoder[i] = gpujpeg_decoder_create();
    if(!s->jpeg_decoder[i]) {
        fprintf(stderr, "Creating JPEG decoder failed.\n");
        return 0;
    }

    s->input[i] = malloc(desc.width * desc.height);
    s->output[i] = malloc(desc.width / s->ppb * desc.height);

    return desc.width * desc.height;
}
Example #27
static void retrieveGaugeField(Float *cpuGauge, FloatN *gauge, GaugeFieldOrder cpu_order,
			       QudaReconstructType reconstruct, int bytes, int volumeCB, int pad) {

  // Use pinned memory; allocate before deriving the even/odd pointers
  FloatN *packed;
  cudaMallocHost((void**)&packed, bytes);

  FloatN *packedEven = packed;
  FloatN *packedOdd = (FloatN*)((char*)packed + bytes/2);

  cudaMemcpy(packed, gauge, bytes, cudaMemcpyDeviceToHost);
    
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else {
    errorQuda("Invalid gauge_order");
  }
    
  cudaFreeHost(packed);
}
Example #28
T*
Pool<T>::get( const MemoryType mtype, const size_t _n )
{
  assert( !destroyed_ );

  if( pad_==0 ) pad_ = _n/10;
  size_t n = _n+pad_;

  switch(mtype) {
  case LOCAL_RAM: {
    T* field = NULL;
    typename FQSizeMap::iterator ifq = cpufqm_.lower_bound( n );
    if( ifq == cpufqm_.end() ){
      ifq = cpufqm_.insert( ifq, make_pair(n,FieldQueue()) );
    }
    else{
      n = ifq->first;
    }
    FieldQueue& fq = ifq->second;
    if( fq.empty() ){
      ++cpuhighWater_;
      try{
#ifdef ENABLE_CUDA
        /* Pinned Memory Mode
         * As the Pinned memory allocation and deallocation has higher overhead
         * this operation is performed at memory pool level which is created
         * and destroyed only once.
         */
        cudaError_t err;
        if (cudaSuccess != (err = cudaMallocHost((void**)&field, n*sizeof(T)))) {
          std::ostringstream msg;
          msg << "WARNING : Pinned Memory allocation failed , at " << __FILE__ << " : " << __LINE__
              << std::endl;
          msg << "\t - " << cudaGetErrorString(err);
          msg << "Allocating Pageable memory instead. \n";
          field = new T[n];
          pinned_ = false;
        }
#else
        // Pageable Memory mode
        field = new T[n];
#endif
      }
      catch(std::runtime_error& e){
        std::cout << "Error occurred while allocating memory on LOCAL_RAM" << std::endl
                  << e.what() << std::endl
                  << __FILE__ << " : " << __LINE__ << std::endl;
      }
      fsm_[field] = n;
    }
    else{
      field = fq.top(); fq.pop();
    }
    return field;
  }
# ifdef ENABLE_CUDA
  case EXTERNAL_CUDA_GPU: {
    T* field = NULL;
    typename FQSizeMap::iterator ifq = gpufqm_.lower_bound( n );
    if( ifq == gpufqm_.end() ){
      ifq = gpufqm_.insert( ifq, make_pair(n,FieldQueue()) );
    }
    else{
      n = ifq->first;
    }
    FieldQueue& fq = ifq->second;
    if( fq.empty() ) {
      ++gpuhighWater_;
      ema::cuda::CUDADeviceInterface& CDI = ema::cuda::CUDADeviceInterface::self();
      field = (T*)CDI.get_raw_pointer( n * sizeof(T), deviceIndex_ );
      fsm_[field] = n;
    }
    else{
      field = fq.top(); fq.pop();
    }
    return field;
  }
# endif
  default: {
    std::ostringstream msg;
      msg << "Attempt to get unsupported memory pool ( "
          << DeviceTypeTools::get_memory_type_description(mtype)
          << " ) \n";
      msg << "\t " << __FILE__ << " : " << __LINE__;
      throw(std::runtime_error(msg.str()));
  }
  } //switch
}
Example #29
TEST(Transformation, Bin)
{
	for(int i = 11; i < 12; i++)
	{	
		cudaDeviceReset();

		srand(i);
		int size = (1<<i);
		int batch = 1;
		int bincount = 5;

		tfloat* h_input = (tfloat*)malloc(size * size * batch * sizeof(tfloat));
		for(int b = 0; b < batch; b++)
		{
			for(int j = 0; j < size * size; j++)
				h_input[b * size * size + j] = (tfloat)(j % (1<<bincount));
		}
		tfloat* d_input = (tfloat*)CudaMallocFromHostArray(h_input, size * size * batch * sizeof(tfloat));

		tfloat* d_result;
		cudaMalloc((void**)&d_result, size * size / (1<<(bincount * 2)) * sizeof(tfloat));

		int3 dims;
		dims.x = size;
		dims.y = size;
		d_Bin(d_input, d_result, dims, bincount, 1);

		tfloat* h_result = (tfloat*)MallocFromDeviceArray(d_result, size * size / (1<<(bincount * 2)) * sizeof(tfloat));

		ASSERT_ARRAY_EQ(h_result, (tfloat)((1<<bincount) - 1) / (tfloat)2, size * size / (1<<(bincount * 2)));

		cudaFree(d_input);
		cudaFree(d_result);
		free(h_input);
		free(h_result);

		cudaDeviceReset();
	}

	for(int i = 9; i < 10; i++)
	{	
		cudaDeviceReset();

		srand(i);
		size_t size = (1<<i);
		size_t batch = 1;
		size_t bincount = 2;

		tfloat* h_input;
		cudaMallocHost((void**)&h_input, size * size * size * batch * sizeof(tfloat), 0);
		for(int b = 0; b < batch; b++)
		{
			for(int j = 0; j < size * size * size; j++)
				h_input[b * size * size * size + j] = (tfloat)(j % (1<<bincount));
		}
		tfloat* d_input = (tfloat*)CudaMallocFromHostArray(h_input, size * size * size * batch * sizeof(tfloat));

		tfloat* d_result;
		cudaMalloc((void**)&d_result, size * size * size / (1<<(bincount * 3)) * batch * sizeof(tfloat));

		int3 dims;
		dims.x = size;
		dims.y = size;
		dims.z = size;
		d_Bin(d_input, d_result, dims, bincount, batch);

		tfloat* h_result = (tfloat*)MallocFromDeviceArray(d_result, size * size * size / (1<<(bincount * 3)) * batch * sizeof(tfloat));

		ASSERT_ARRAY_EQ(h_result, (tfloat)((1<<bincount) - 1) / (tfloat)2, size * size / (1<<(bincount * 2)));

		cudaFreeHost(h_input);
		free(h_result);
		cudaFree(d_input);
		cudaFree(d_result);

		cudaDeviceReset();
	}
}
Example #30
magma_int_t
magma_d_initP2P ( magma_int_t *bw_bmark, magma_int_t *num_gpus ){


    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
     (cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("Two or more Tesla(s) with (SM 2.0)"
                        " class GPUs are required for P2P.\n");
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid_tesla[64]; // find the first two GPU's that can support P2P
    int gpu_count = 0;   // GPUs that meet the criteria

    for (int i=0; i < gpu_n; i++) {
         (cudaGetDeviceProperties(&prop[i], i));
        // Only Tesla boards based on Fermi can support P2P
        {
            // This is an array of P2P capable GPUs
            gpuid_tesla[gpu_count++] = i;
        }
    }
    *num_gpus=gpu_n;

     for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
  // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");
    int can_access_peer_0_1, can_access_peer_1_0;
    // In this case we just pick the first two that we can support
     (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], 
                                                        gpuid_tesla[j]));
     (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], 
                                                        gpuid_tesla[i]));


    // Output results from P2P capabilities
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],                                                                  
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j] ,
                             can_access_peer_0_1 ? "Yes" : "No");
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j],
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],
                            can_access_peer_1_0 ? "Yes" : "No");

    if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
    {
        printf("Two or more Tesla(s) with class"
                " GPUs are required for P2P to run.\n");
        printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n");
        printf("Peer to Peer access is not available between"
        " GPU%d <-> GPU%d, waiving test.\n", gpuid_tesla[i], gpuid_tesla[j]);
        printf("PASSED\n");
        //exit(EXIT_SUCCESS);
    }
     }    
    }

  // Enable peer access
     for(int i=0; i<gpu_n; i++)
     {
         for(int j=i+1; j<gpu_n; j++)
         {
             printf("Enabling peer access between GPU%d and GPU%d...\n",
                gpuid_tesla[i], gpuid_tesla[j]);
              (cudaSetDevice(gpuid_tesla[i]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0));
              (cudaSetDevice(gpuid_tesla[j]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0));
           magma_dcheckerr("P2P");
         }
     }

   magma_dcheckerr("P2P successful");


    // Enable peer access
    for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Check that we got UVA on both devices
    printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 
    gpuid_tesla[i], gpuid_tesla[j]);
    //const bool has_uva = (prop[gpuid_tesla[i]].unifiedAddressing && 
    //                            prop[gpuid_tesla[j]].unifiedAddressing);

    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[i]].name, 
    gpuid_tesla[i], (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No") );
    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[j]].name, 
    gpuid_tesla[j], (prop[gpuid_tesla[j]].unifiedAddressing ? "Yes" : "No") );

        }
    }



  if(*bw_bmark==1){


    // P2P memcopy() benchmark
   for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", 
                int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]);
    (cudaSetDevice(gpuid_tesla[i]));
    float* g0;
    (cudaMalloc(&g0, buf_size));
    (cudaSetDevice(gpuid_tesla[j]));
    float* g1;
    (cudaMalloc(&g1, buf_size));
    float* h0;
    (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    (cudaEventCreateWithFlags(&start_event, eventflags));
    (cudaEventCreateWithFlags(&stop_event, eventflags));


    (cudaEventRecord(start_event, 0));
    for (int k=0; k<100; k++)
    {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
            
        // Ping-pong copy between GPUs
        if (k % 2 == 0)
            (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        else
            (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    }
    (cudaEventRecord(stop_event, 0));
    (cudaEventSynchronize(stop_event));
    (cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between"
            "GPU%d and GPU%d: %.2fGB/s\n", gpuid_tesla[i], gpuid_tesla[j],
        (1.0f / (time_memcpy / 1000.0f)) 
            * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);


     // Cleanup and shutdown
    printf("Cleanup of P2P benchmark...\n");
    (cudaEventDestroy(start_event));
    (cudaEventDestroy(stop_event));
    (cudaSetDevice(gpuid_tesla[i]));
    (magma_free( g0) );
    (cudaSetDevice(gpuid_tesla[j]));
    (magma_free( g1) );
    (magma_free_cpu( h0) );

    }
    }

    // host-device memcopy() benchmark

        for(int j=0; j<gpu_n; j++)
        {
    cudaSetDevice(gpuid_tesla[j]);

    int *h_data_source;
    int *h_data_sink;

    int *h_data_in[STREAM_COUNT];
    int *d_data_in[STREAM_COUNT];

    int *h_data_out[STREAM_COUNT];
    int *d_data_out[STREAM_COUNT];


    cudaEvent_t cycleDone[STREAM_COUNT];
    cudaStream_t stream[STREAM_COUNT];

    cudaEvent_t start, stop;

    // Allocate resources
    int memsize;
    memsize = 1000000 * sizeof(int);

    h_data_source = (int*) malloc(memsize);
    h_data_sink = (int*) malloc(memsize);    

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        ( cudaHostAlloc(&h_data_in[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_in[i], memsize) );
        
        ( cudaHostAlloc(&h_data_out[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_out[i], memsize) );

        
        ( cudaStreamCreate(&stream[i]) );
        ( cudaEventCreate(&cycleDone[i]) ); 
        
        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start); cudaEventCreate(&stop);

    

    // Time host-device copies
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, 
        cudaMemcpyHostToDevice,0) );
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_h2d_time;    
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, 
        cudaMemcpyDeviceToHost, 0) );        
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_d2h_time;    
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
    
    cudaEventSynchronize(stop);
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n", j,
        memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time );
    printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n", j,
        memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);

    // Free resources

    free( h_data_source );
    free( h_data_sink );

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        magma_free_cpu( h_data_in[i] );
        magma_free( d_data_in[i] );

        magma_free_cpu( h_data_out[i] );
        magma_free( d_data_out[i] );
        
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);        
    }
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    
   }

  }//end if-loop bandwidth_benchmark

    magma_dcheckerr("P2P established");

    return MAGMA_SUCCESS;

}