/**
 * Allocates the host and device buffers used for one (numb, numt) configuration.
 *
 * Previously held resources are released first through DeallocateResources().
 * One cuda_in element and numb*numt cuda_out elements are then allocated on
 * the host (malloc) and on the device (cuMemAlloc).
 *
 * @param numb first dimension of the configuration (presumably the block
 *             count - confirm against the caller)
 * @param numt second dimension of the configuration (presumably the thread
 *             count per block - confirm against the caller)
 * @return true when every allocation succeeded, false otherwise. On failure
 *         the buffers that were obtained stay assigned to their members and
 *         failed device handles are reset to 0.
 */
const bool CUDARunner::AllocateResources(const int numb, const int numt)
{
	bool allocated=true;
	CUresult rval;
	DeallocateResources();

	// Do the size arithmetic in size_t: numb*numt evaluated as int can
	// overflow before it is widened for the byte count.
	const size_t outbytes=static_cast<size_t>(numb)*static_cast<size_t>(numt)*sizeof(cuda_out);

	m_in=(cuda_in *)malloc(sizeof(cuda_in));
	m_out=(cuda_out *)malloc(outbytes);
	if(m_in==0 || m_out==0)
	{
		// Host allocation failures were previously ignored and only
		// surfaced later as a crash on first use.
		printf("Error allocating host memory\n");
		allocated=false;
	}

	rval=cuMemAlloc(&m_devin,sizeof(cuda_in));
	if(rval!=CUDA_SUCCESS)
	{
		printf("Error %d allocating CUDA memory\n",rval);
		m_devin=0;
		allocated=false;
	}
	rval=cuMemAlloc(&m_devout,outbytes);
	if(rval!=CUDA_SUCCESS)
	{
		printf("Error %d allocating CUDA memory\n",rval);
		m_devout=0;
		allocated=false;
	}

	// Only announce success when everything was actually allocated.
	if(allocated)
	{
		printf("Done allocating CUDA resources for (%d,%d)\n",numb,numt);
	}
	return allocated;
}
Esempio n. 2
0
void gpu_transpose_naive(int *dest, const int *src, int height, int width) {
    assert((width & (width - 1)) == 0);  // TODO
    assert((height & (height - 1)) == 0);

    cuda->set_default_module("transpose.ptx");
    CUfunction transpose_kernel = cuda->get_kernel("transpose_naive");

    int grid_dim_x = width / BLOCK_DIM_X;
    int grid_dim_y = height / BLOCK_DIM_Y;

    CUdeviceptr device_src;
    CUdeviceptr device_dest;
    cuMemAlloc(&device_src, width*height*sizeof(int));
    cuMemAlloc(&device_dest, width*height*sizeof(int));
    cuMemcpyHtoD(device_src, src, width*height*sizeof(int));

    void *args[] = {&device_dest, &device_src, &height, &width};
    cuda->launch_kernel_2d_sync(transpose_kernel,
            grid_dim_x, grid_dim_y,
            BLOCK_DIM_X, BLOCK_DIM_Y,
            args);

    cuMemcpyDtoH(dest, device_dest, width*height*sizeof(int));
    cuMemFree(device_src);
    cuMemFree(device_dest);
    cuda->ctx_synchronize();
}
Esempio n. 3
0
File: ov.c Progetto: CPFL/gtraffic
/*
 * Allocate the device buffers x_dev, v_dev, error_dev and s_time_dev.
 *
 * x_dev and v_dev each receive room for N doubles, error_dev for one int and
 * s_time_dev for one double. The result of every cuMemAlloc call is stored
 * in `res` (declared outside this function); on the first failure a message
 * is printed and the process exits with status 1.
 */
void
get_dev_mem(void){

  /* x_dev: N doubles */
  if((res = cuMemAlloc(&x_dev, N * sizeof(double))) != CUDA_SUCCESS){
    printf("cuMemAlloc(x) failed: res = %s\n", conv(res));
    exit(1);
  }

  /* v_dev: N doubles */
  if((res = cuMemAlloc(&v_dev, N * sizeof(double))) != CUDA_SUCCESS){
    printf("cuMemAlloc(v) failed: res = %s\n", conv(res));
    exit(1);
  }

  /* error_dev: a single int */
  if((res = cuMemAlloc(&error_dev, sizeof(int))) != CUDA_SUCCESS){
    printf("cuMemAlloc(error) failed: res = %s\n", conv(res));
    exit(1);
  }

  /* s_time_dev: a single double */
  if((res = cuMemAlloc(&s_time_dev, sizeof(double))) != CUDA_SUCCESS){
    printf("cuMemAlloc(s_time) failed: res = %s\n", conv(res));
    exit(1);
  }

}
Esempio n. 4
0
void gpu_transpose_with_shared_mem(int *dest, const int *src, int height, int width) {
    assert((width & (width - 1)) == 0);  // TODO
    assert((height & (height - 1)) == 0);

    cuda->set_default_module(CUDA_PTX_PREFIX"transpose.cu.ptx");
    CUfunction transpose_kernel = cuda->get_kernel("transpose_with_shared_mem");

    int grid_dim_x = width / TILE_DIM;
    int grid_dim_y = height / TILE_DIM;

    CUdeviceptr device_src;
    CUdeviceptr device_dest;
    cuMemAlloc(&device_src, width*height*sizeof(int));
    cuMemAlloc(&device_dest, width*height*sizeof(int));
    cuMemcpyHtoD(device_src, src, width*height*sizeof(int));

    void *args[] = {&device_dest, &device_src};
    cuda->launch_kernel_2d_sync(transpose_kernel,
            grid_dim_x, grid_dim_y,
            TILE_DIM, 2,
            args);

    cuMemcpyDtoH(dest, device_dest, width*height*sizeof(int));
    cuMemFree(device_src);
    cuMemFree(device_dest);
    cuda->ctx_synchronize();
}
Esempio n. 5
0
/*
 * Runs a CUDA kernel over a set of CudaFloatArray arguments and returns a
 * newly allocated CudaFloatArray holding the result.
 *
 * argv[0 .. argcv[0]-1] are the input arrays and argv[argcv[0]] is the path
 * of the module to load. The kernel name is "block" followed by the module
 * path with its first strlen("_cuda/") characters dropped and everything
 * from the first '.' onwards removed. The kernel receives, in order: the
 * result buffer, one device buffer per input array, and the element count
 * (the smallest input size seen so far when each buffer is created).
 *
 * NOTE(review): the module path is assumed to be at least strlen("_cuda/")
 * characters long - confirm against the caller.
 */
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    /* A failure of cuInit/cuDeviceGetCount leaves deviceCount at 0 and is
     * reported through the "no devices" error below. */
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    errcheck(cuDeviceGet(&cuDevice, 0));
    errcheck(cuCtxCreate(&cuContext, 0, cuDevice));
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    char name[256];
    strcpy(name, "block");
    /* Bounded append: a long module path must not overflow name[]. */
    strncat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"),
            sizeof(name) - strlen(name) - 1);
    /* Cut the name at the first '.', dropping the file extension. */
    char *dot = strchr(name, '.');
    if (dot != NULL)
        *dot = 0;
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    /* The module and the context are created on every call, so release them
     * here; the returned array only references host memory. */
    cuModuleUnload(cuModule);
    cuCtxDestroy(cuContext);
    return (Object)r;
}
Esempio n. 6
0
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 *
 * For every device a fresh context is created, the kernel named "kernel" is
 * loaded from the embedded image `imageBytes`, launched ITERATIONS times
 * between two events, and the elapsed time divided by ITERATIONS is printed.
 * Every driver call goes through CU_ERROR_CHECK (defined outside this view).
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  // Host value uploaded to every device as the kernel's input.
  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    // One context per device, destroyed at the end of the iteration.
    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    // Single-float input and output buffers.
    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    // Kernel arguments are passed by address: (in, out).
    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    // Time ITERATIONS back-to-back launches of one block with one thread.
    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    // Wait for the stop event so the elapsed time covers all launches.
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    // Average time per launch.
    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
/**
 * Runs the "saxpy" kernel from saxpy.ptx on the given device and verifies
 * the result on the host.
 *
 * The output buffer is zeroed, the input buffer is filled with random values
 * in [0, 1], the kernel is launched with 1500 blocks of 512 threads, and
 * every output element is compared against alpha * input within 1e-5.
 *
 * CUDA_CHECK is defined outside this view; it is assumed to store the call's
 * result in `status` and jump to `Error` on failure - confirm.
 *
 * @param chDevice device wrapper supplying the context and the module
 * @param N        number of float elements
 * @param alpha    scale factor passed to the kernel
 * @return CUDA_SUCCESS when the results match, CUDA_ERROR_NOT_FOUND when the
 *         module is missing, CUDA_ERROR_UNKNOWN on a mismatch, otherwise the
 *         failing driver call's status.
 */
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;
    // Tracks whether the context push succeeded so cleanup only pops what
    // this function pushed.
    bool contextPushed = false;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );
    contextPushed = true;

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    // The copies above are asynchronous; wait before reading hostOut.
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    // Free while the context is still current: popping first can leave no
    // current context, in which case the frees fail and the memory leaks.
    if ( hostOut ) cuMemFreeHost( hostOut );
    if ( hostIn ) cuMemFreeHost( hostIn );
    if ( dptrOut ) cuMemFree( dptrOut );
    if ( dptrIn ) cuMemFree( dptrIn );
    if ( contextPushed ) cuCtxPopCurrent( NULL );
    return status;
}
Esempio n. 8
0
/*
 * Allocates len bytes of device memory and clears it to zero.
 *
 * try_init() is called first. A zero-length request returns NULL without
 * touching the device. On allocation failure a diagnostic is printed and the
 * process aborts. On success state.bytes_allocated is increased by len and
 * the buffer is cleared with swanMemset.
 *
 * Returns the device address as a void pointer, or NULL when len is 0.
 */
void *swanMalloc( size_t len ) {
	void *ptr;
	CUdeviceptr dptr;
	CUresult err;
	try_init();
	if( len == 0 ) {
//		printf("SWAN: WARNING - swnaMalloc() called with 0\n");
		return NULL;
	}

	err = cuMemAlloc( (CUdeviceptr*) &dptr, len );

	if ( err != CUDA_SUCCESS ) {
		/* %lu expects unsigned long; cast explicitly because size_t is
		 * a different type on some platforms. */
		printf("Attempted to allocate %lu bytes (%lu already allocated)\n", (unsigned long) len, (unsigned long) state.bytes_allocated );
		/* abort() does not return, so the error() call that used to
		 * follow it could never run and has been dropped. */
		abort();
	}

	/* Convert only after the check so an unset dptr is never read. */
	ptr = (void*)dptr;
	state.bytes_allocated += len;


	// MJH likes his memory clean
	swanMemset( ptr, 0, len );
	return ptr;
}
Esempio n. 9
0
// Gives `buf` a device allocation if it does not already have one.
//
// The allocation size comes from __buf_size(user_context, buf); the address
// returned by cuMemAlloc is stored in buf->dev and asserted to be non-zero.
// The call is wrapped in TIME_CALL (defined outside this view).
WEAK void halide_dev_malloc(void *user_context, buffer_t* buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    size_t size = __buf_size(user_context, buf);

    // Debug builds log the size, extents, strides and element size.
    #ifdef DEBUG
    halide_printf(user_context, "dev_malloc allocating buffer of %zd bytes, "
                  "extents: %zdx%zdx%zdx%zd strides: %zdx%zdx%zdx%zd (%d bytes per element)\n",
                  size, buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3],
                  buf->stride[0], buf->stride[1], buf->stride[2], buf->stride[3],
                  buf->elem_size);
    #endif

    CUdeviceptr p;
    TIME_CALL( cuMemAlloc(&p, size), "dev_malloc");

    // The device address is stored as a 64-bit integer in the buffer.
    buf->dev = (uint64_t)p;
    halide_assert(user_context, buf->dev);

    // Debug builds additionally validate the new device pointer.
    #ifdef DEBUG
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    #endif
}
Esempio n. 10
0
    /**
     * Returns a device pointer on CUDA device `devid` whose contents can be
     * read, uploading the host copy first when the device copy is stale.
     *
     * If the device copy is already marked valid it is returned directly.
     * Otherwise the host copy must be valid (the process aborts if not); the
     * device buffer is allocated on first use with `byte_size` bytes and
     * `read_byte_size` bytes are copied from `host_ptr`. Allocation or copy
     * failures abort the process.
     *
     * @param env            environment providing the per-device contexts
     * @param devid          index into the CUDA device lists
     * @param read_byte_size number of bytes to upload from the host copy
     * @return the device pointer for `devid`
     */
    CUdeviceptr get_read_ptr_cuda(ComputeEnv *env,int devid, size_t read_byte_size) {
        if (cuda_valid_list[devid]) {
            return cuda_ptr_list[devid];
        }

        if (host_valid == false) {
            /* xx */
            abort();
            return 0;
        }

        CUDADev *dev = &env->cuda_dev_list[devid];
        cuCtxPushCurrent(dev->context);

        if (cuda_ptr_list[devid] == 0) {
            CUresult err;
            err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
            if (err != CUDA_SUCCESS) {
                abort();
            }
        }

        //double t0 = getsec();
        // A failed upload must not be recorded as a valid device copy;
        // treat it like an allocation failure.
        if (cuMemcpyHtoD(cuda_ptr_list[devid], host_ptr, read_byte_size) != CUDA_SUCCESS) {
            abort();
        }
        //double t1 = getsec();
        //env->transfer_wait = t1-t0;

        cuda_valid_list[devid] = true;

        CUcontext old;
        cuCtxPopCurrent(&old);

        return cuda_ptr_list[devid];
    }
Esempio n. 11
0
// Test program: loads a PTX kernel whose first instruction branches straight
// to the `ret` at label BB1_2, so the store of 5 is skipped. The host value
// uploaded before the launch (10) is therefore expected to come back unchanged.
int main(){
	init_test();
	// PTX source; the unconditional `bra BB1_2` jumps over the load/store.
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel(.param .u64 kernel_param_0) {\n"
	".reg .s32 	%r<2>;\n"
	".reg .s64 	%rd<3>;\n"
	"bra 	BB1_2;\n"
	"ld.param.u64 	%rd1, [kernel_param_0];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"mov.u32 	%r1, 5;\n"
	"st.global.u32 	[%rd2], %r1;\n"
	"BB1_2: ret;\n"
	"}\n";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel"));
	CUdeviceptr devValue;
	int hostValue = 10;
	// Upload the initial value so the device buffer holds 10 before launch.
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue)));
	// The kernel's single parameter is the device address, passed by address.
	void * params[] = {&devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue)));
	// The store was branched over, so the value must still be 10.
	assert(hostValue == 10);
	std::cout << hostValue << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
Esempio n. 12
0
/*
 * Runs one complex-to-real FFT on the GPU, in place on the host buffer.
 *
 * getPlanSize(plan) bytes are copied from h_data to a temporary device
 * buffer, cufftExecC2R is executed with that buffer as both input and
 * output, and the same number of bytes is copied back into h_data.
 *
 * Any failure prints a message and terminates the process with a distinct
 * exit code (90: allocation, 91: upload, 92: transform, 93: download,
 * 94: free).
 */
void sararfftnd_one_complex_to_real(
  sararfftnd_plan plan, sarafft_complex *h_data
) {
  CUdeviceptr d_data;
  size_t planSize = getPlanSize( plan );
  if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
    /* planSize is a size_t; cast so the argument matches %lu everywhere. */
    printf( "cuMemAlloc failed for plansize %lu!\n", (unsigned long) planSize );
    fflush ( stdout );
    exit( 90 );
  }
  if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
    printf( "cuMemcpyHtoD failed!\n" );
    fflush ( stdout );
    exit( 91 );
  }
  /* In-place transform: the same device buffer is input and output. */
  if ( CUFFT_SUCCESS != cufftExecC2R( plan, ( cufftComplex* )d_data, ( cufftReal* )d_data ) ) {
    /* The message now names the call that actually failed (C2R). */
    printf( "cufftExecC2R failed!\n" );
    fflush ( stdout );
    exit( 92 );
  }
  if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
    printf( "cuMemcpyDtoH failed!\n" );
    fflush ( stdout );
    exit( 93 );
  }
  if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
    printf( "cuMemFree failed!\n" );
    fflush ( stdout );
    exit( 94 );
  }
}
Esempio n. 13
0
    /**
     * Makes sure the host buffer exists and, for OpenCL/CUDA targets, that a
     * device buffer of `byte_size` bytes exists for the target device.
     *
     * The host buffer is allocated on first use with 64-byte alignment. For
     * OpenCL a one-byte blocking write is issued right after creating the
     * buffer; if that write fails the buffer is released again. For CUDA the
     * allocation is done with the device's context pushed.
     *
     * @param env environment selecting the target processor and devices
     * @return false if a required allocation failed, true otherwise
     */
    bool prealloc(ComputeEnv *env) {
        // Host side: allocate lazily and give up right away on failure.
        if (host_ptr == nullptr) {
            host_ptr = _mm_malloc(byte_size, 64);
        }
        if (host_ptr == nullptr) {
            return false;
        }

        if (env->target_processor.type == W2XCONV_PROC_OPENCL) {
            const int devid = env->target_processor.devid;
            if (cl_ptr_list[devid] != nullptr) {
                return true;
            }

            cl_int status;
            OpenCLDev *cl_dev = &env->cl_dev_list[devid];
            cl_ptr_list[devid] = clCreateBuffer(cl_dev->context,
                                                CL_MEM_READ_WRITE,
                                                byte_size, nullptr, &status);
            if (cl_ptr_list[devid] == nullptr) {
                return false;
            }

            /* touch memory to force allocation */
            char probe = 0;
            status = clEnqueueWriteBuffer(cl_dev->queue, cl_ptr_list[devid],
                                          CL_TRUE, 0, 1, &probe, 0, nullptr, nullptr);
            if (status == CL_SUCCESS) {
                return true;
            }

            // The write failed: drop the buffer so a later call can retry.
            clReleaseMemObject(cl_ptr_list[devid]);
            cl_ptr_list[devid] = nullptr;
            return false;
        }

        if (env->target_processor.type == W2XCONV_PROC_CUDA) {
            const int devid = env->target_processor.devid;
            if (cuda_ptr_list[devid] != 0) {
                return true;
            }

            // Allocate with the device's context pushed, then pop it again.
            CUDADev *cu_dev = &env->cuda_dev_list[devid];
            cuCtxPushCurrent(cu_dev->context);
            const CUresult status = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
            CUcontext popped;
            cuCtxPopCurrent(&popped);

            return status == CUDA_SUCCESS;
        }

        // Host target (or any other type): nothing further to allocate.
        return true;
    }
Esempio n. 14
0
	/* Allocates mem.memory_size() bytes of device memory and stores the
	 * resulting address in mem.device_pointer. The allocation happens
	 * between cuda_push_context() and cuda_pop_context(). The `type`
	 * argument is not used by this implementation. */
	void mem_alloc(device_memory& mem, MemoryType type)
	{
		cuda_push_context();
		CUdeviceptr device_pointer;
		/* cuda_assert is defined outside this view; it is used here
		 * without a trailing semicolon. */
		cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
		mem.device_pointer = (device_ptr)device_pointer;
		cuda_pop_context();
	}
Esempio n. 15
0
// Allocates `size` bytes of device memory and hands the owning handle back
// through *ppMem.
//
// A new CuDeviceMem is created, filled by cuMemAlloc, tagged with its size
// and this context, and swapped into *ppMem. HANDLE_RESULT() is defined
// outside this view; it presumably returns early when `result` is an error.
// Returns CUDA_SUCCESS once the handle has been swapped in.
CUresult CuContext::ByteAlloc(size_t size, DeviceMemPtr* ppMem) {
	DeviceMemPtr block(new CuDeviceMem);

	CUresult result = cuMemAlloc(&block->_deviceptr, size);
	HANDLE_RESULT();

	// Record ownership details before publishing the handle.
	block->_context = this;
	block->_size = size;

	ppMem->swap(block);
	return CUDA_SUCCESS;
}
Esempio n. 16
0
        /// Allocates memory buffer on the device associated with the given queue.
        device_vector(const command_queue &q, size_t n) : n(n) {
            if (n) {
                q.context().set_current();

                CUdeviceptr ptr;
                cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

                buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)), detail::deleter() );
            }
        }
Esempio n. 17
0
static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags,
			   int *ret) {
    gpudata *res;
    cuda_context *ctx = (cuda_context *)c;
    int fl = CU_EVENT_DISABLE_TIMING;

    if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR);
    if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) ==
	(GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) FAIL(NULL, GA_VALUE_ERROR);

    /* TODO: figure out how to make this work */
    if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR);

    res = malloc(sizeof(*res));
    if (res == NULL) FAIL(NULL, GA_SYS_ERROR);
    res->refcnt = 1;

    res->sz = size;
    res->flags = flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY);

    cuda_enter(ctx);
    if (ctx->err != CUDA_SUCCESS) {
      free(res);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    if (ctx->flags & GA_CTX_MULTI_THREAD)
      fl |= CU_EVENT_BLOCKING_SYNC;
    ctx->err = cuEventCreate(&res->ev, fl);

    if (ctx->err != CUDA_SUCCESS) {
      free(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    if (size == 0) size = 1;

    ctx->err = cuMemAlloc(&res->ptr, size);
    if (ctx->err != CUDA_SUCCESS) {
        cuEventDestroy(res->ev);
        free(res);
        cuda_exit(ctx);
        FAIL(NULL, GA_IMPL_ERROR);
    }
    res->ctx = ctx;
    ctx->refcnt++;

    if (flags & GA_BUFFER_INIT) {
      ctx->err = cuMemcpyHtoD(res->ptr, data, size);
      if (ctx->err != CUDA_SUCCESS) {
	cuda_free(res);
	FAIL(NULL, GA_IMPL_ERROR)
      }
Esempio n. 18
0
// Gives `buf` a device allocation if it does not already have one.
//
// A CudaContext is set up from user_context first; its error is returned if
// that fails. If buf->dev is already set, the existing pointer is validated
// and 0 is returned. Otherwise _buf_size(user_context, buf) bytes are
// allocated with cuMemAlloc and the address is stored in buf->dev.
//
// Returns 0 on success, or the failing CUDA error code.
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, "    allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUdeviceptr p;
    // %lld needs a long long argument; size_t is narrower on 32-bit targets,
    // so cast explicitly, as the log statement above already does.
    DEBUG_PRINTF( user_context, "    cuMemAlloc %lld -> ", (long long)size );
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err));
        halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        // CUdeviceptr is an integer type; %p requires a pointer argument.
        DEBUG_PRINTF( user_context, "%p\n", (void *)p );
    }
    halide_assert(user_context, p);
    buf->dev = (uint64_t)p;

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Esempio n. 19
0
/* Allocate S bytes of device memory with cuMemAlloc.

   Returns the device address as a pointer on success and 0 when the driver
   reports CUDA_ERROR_OUT_OF_MEMORY.  Any other failure is reported through
   GOMP_PLUGIN_fatal.  */
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r = cuMemAlloc (&d, s);

  switch (r)
    {
    case CUDA_SUCCESS:
      break;
    case CUDA_ERROR_OUT_OF_MEMORY:
      return 0;
    default:
      GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
      break;
    }

  return (void *)d;
}
Esempio n. 20
0
	// Allocates size() bytes for this buffer and stores the address in
	// _devicePtr. Buffers with unified host access get managed memory
	// attached globally; every other buffer gets plain device memory.
	void Buffer::allocate()
	{
		if (!(hostAccess() == BufferAccess::Unified))
		{
			// Plain device memory
			VCL_CU_SAFE_CALL(cuMemAlloc(&_devicePtr, size()));
		}
		else
		{
			// Managed memory for unified host access
			VCL_CU_SAFE_CALL(cuMemAllocManaged(&_devicePtr, size(), CU_MEM_ATTACH_GLOBAL));
		}
	}
Esempio n. 21
0
/*
 * R entry point wrapping cuMemAlloc.
 *
 * The first element of the numeric vector r_bytesize is used as the number
 * of bytes to allocate. On failure the value built by R_cudaErrorInfo for
 * the CUDA status is returned; on success the device address is wrapped by
 * R_createRef with the type tag "CUdeviceptr".
 */
SEXP
R_auto_cuMemAlloc(SEXP r_bytesize)
{
    CUdeviceptr dptr;
    size_t bytesize = REAL(r_bytesize)[0];
    CUresult ans = cuMemAlloc(& dptr,  bytesize);

    if(ans != CUDA_SUCCESS)
       return(R_cudaErrorInfo(ans));

    return(R_createRef((void*) dptr, "CUdeviceptr"));
}
Esempio n. 22
0
// Re-creates the image resources for the current `width` x `height`.
//
// The previous image, host pixel buffer and device pixel buffer are released,
// item_size is recomputed as width * height * 4, and all three are allocated
// again. Finally the new device buffer's address is copied to d_fragColor.
// NOTE(review): the result of malloc is not checked here.
void setupSizeResource()
{
    // Drop the resources sized for the previous dimensions.
    deleteImage(img);
    free(img_content);
    checkCudaErrors(cuMemFree(d_img_content));

    // Four bytes per pixel.
    item_size = width * height * 4;

    img = createImage(width, height);
    img_content = (unsigned char*)malloc(item_size);
    checkCudaErrors(cuMemAlloc(&d_img_content, item_size));
    // Copies the device pointer value itself (not the pixel data) to d_fragColor.
    checkCudaErrors(cuMemcpyHtoD(d_fragColor, &d_img_content, d_fragColor_bytes));
}
Esempio n. 23
0
/*
 * Allocates the requested number of megabytes both as page-locked host
 * memory and as device memory, printing the status of every driver call.
 *
 * Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]
 *
 * The allocations are held until a key is pressed, then released before the
 * context is destroyed. The program always returns 0 after the argument
 * checks; driver failures are only reported through the printed statuses.
 */
int main(int argc, char *argv[])
{
	char c;
	CUcontext ctx;
	CUdevice dev = 0;
	void *toSpace = NULL;
	int status;
	int host_ok, gpu_ok;
	/* cuMemGetInfo reports byte counts through size_t, not int. */
	size_t free_bytes = 0, total_bytes = 0;
	CUdeviceptr ptr = (CUdeviceptr)NULL;
	int size;
	size_t bytes;
	
	if(argc != 2){
		fprintf(stderr,"Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n");
		exit(1);
	}
	
	printf("All status results should be 0, if not an error has occured.\nIf 2 is reported an out of memory error has occured for\nwhich you should decrease the memory input\n");
	size = atoi(argv[1]);
	
	printf("\nTrying to allocate %iMB of memory on host and GPU\n",size);
	
	if(size <= 0){
		fprintf(stderr,"\nERROR: Memory must be greater than 0\n");
		exit(1);
	}

	/* Widen before scaling: size*1024*1024 overflows int from 2048 MB on. */
	bytes = (size_t)size * 1024 * 1024;
	
	status = cuInit(0);
	printf("Init status: %i\n",status); 

	/* Obtain a real device handle instead of assuming handle 0 is valid. */
	status = cuDeviceGet(&dev, 0);
	printf("Device get status: %i\n",status); 

	status = cuCtxCreate(&ctx, 0, dev);
	printf("Context creation status: %i\n",status); 
	
	/* Capture the status so the line below reports this call's result. */
	status = cuMemGetInfo(&free_bytes, &total_bytes);
	printf("Get memory info status: %i\n",status); 
	
	printf("\n%.1f/%.1f (Free/Total) MB\n", free_bytes/1024.0/1024.0, total_bytes/1024.0/1024.0);
	
	status = cuMemHostAlloc(&toSpace, bytes, 0); 
	host_ok = (status==CUDA_SUCCESS);
	printf("Host allocation status: %i %s\n",status, host_ok ? "SUCCESS" : "FAILED"); 

	status = cuMemAlloc(&ptr, bytes);
	gpu_ok = (status==CUDA_SUCCESS);
	printf("GPU allocation status: %i %s\n",status, gpu_ok ? "SUCCESS" : "FAILED");

	printf("\nPress any key to exit...");
	scanf("%c", &c);

	/* Release what was obtained while the context still exists. */
	if(gpu_ok)
		cuMemFree(ptr);
	if(host_ok)
		cuMemFreeHost(toSpace);
	
	status = cuCtxDestroy(ctx);
	printf("Context destroy status: %i\n",status); 

	return 0;
}
Esempio n. 24
0
/*
 * Creates a CUDA context on `dev`, stored through `pctx`, and, when `s` is
 * non-zero, allocates `s` bytes of device memory in it. The allocation's
 * address is kept only in a local variable.
 *
 * Returns 0 on success and -1 after printing a message if either the context
 * creation or the allocation fails.
 */
static int
init_thread(CUcontext *pctx,CUdevice dev,size_t s){
	CUdeviceptr ptr;
	CUresult cerr = cuCtxCreate(pctx,0,dev);

	if(cerr != CUDA_SUCCESS){
		fprintf(stderr," Error (%d) creating CUDA context\n",cerr);
		return -1;
	}
	if(s == 0){
		return 0;
	}

	cerr = cuMemAlloc(&ptr,s);
	if(cerr != CUDA_SUCCESS){
		fprintf(stderr," Error (%d) allocating %zub\n",cerr,s);
		return -1;
	}
	return 0;
}
Esempio n. 25
0
// Test program: loads a PTX kernel in which every thread whose tid.x is below
// the second parameter stores tid.x * 1.5 (0f3FC00000) into the float array
// given by the first parameter. The kernel is launched with size*2 threads
// for an array of `size` elements, and the array is printed afterwards.
int main(){
	init_test();
	// PTX source; threads with tid.x >= param_1 branch to BB0_2 and return.
	const std::string test_source =
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry _Z6kernelPfi(\n"
	".param .u64 _Z6kernelPfi_param_0,\n"
	".param .u32 _Z6kernelPfi_param_1){\n"
	".reg .pred 	%p<2>;\n"
	".reg .f32 	%f<3>;\n"
	".reg .s32 	%r<3>;\n"
	".reg .s64 	%rd<5>;\n"
	"ld.param.u64 	%rd1, [_Z6kernelPfi_param_0];\n"
	"ld.param.u32 	%r2, [_Z6kernelPfi_param_1];\n"
	"mov.u32 	%r1, %tid.x;\n"
	"setp.ge.u32	%p1, %r1, %r2;\n"
	"@%p1 bra 	BB0_2;\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"cvt.rn.f32.u32	%f1, %r1;\n"
	"mul.f32 	%f2, %f1, 0f3FC00000;\n"
	"mul.wide.u32 	%rd3, %r1, 4;\n"
	"add.s64 	%rd4, %rd2, %rd3;\n"
	"st.global.f32 	[%rd4], %f2;\n"
	"BB0_2:\n"
	"ret;\n"
	"}";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "_Z6kernelPfi"));
	CUdeviceptr devArray;
	int size = 10;
	// Host copy starts zeroed and is overwritten by the download below.
	float hostArray[size];
	memset(hostArray, 0, size * sizeof(hostArray[0]));
	cu_assert(cuMemAlloc(&devArray, sizeof(float) * size));
	// Kernel parameters, passed by address: (array, element count).
	void * params[] = {&devArray, &size};
	// One block with twice as many threads as elements; the kernel's own
	// bounds check keeps the extra threads from writing.
	auto result = cuLaunchKernel(funcHandle, 1,1,1, size*2,1,1, 0,0, params, nullptr);
	cu_assert(result);
	cu_assert(cuMemcpyDtoH(&hostArray, devArray, sizeof(hostArray[0])*size));
	cu_assert(cuMemFree(devArray));
	cu_assert(cuModuleUnload(modId));
	for (int i=0 ; i<size ; ++i)
		std::cout << hostArray[i] << '\n';
	return 0;
}
Esempio n. 26
0
/*
 * Runs one real-to-complex FFT on the GPU, in place on the host buffer.
 *
 * getPlanSize(plan) bytes are copied from h_data to a temporary device
 * buffer, cufftExecR2C is executed with that buffer as both input and
 * output, and the same number of bytes is copied back into h_data.
 *
 * Any failure prints a message and terminates the process with a distinct
 * exit code (85: allocation, 86: upload, 87: transform, 88: download,
 * 89: free).
 */
void sararfftnd_one_real_to_complex(
  sararfftnd_plan plan, sarafft_real *h_data
) {
  CUdeviceptr d_data;
  size_t planSize = getPlanSize( plan );
//   printf( "planSize = %li!\n", planSize );
//   fflush ( stdout );
  cufftResult fftResult;
  if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
    /* planSize is a size_t; cast so the argument matches %lu everywhere. */
    printf( "cuMemAlloc failed for plansize %lu!\n", (unsigned long) planSize );
    fflush ( stdout );
    exit( 85 );
  }
  if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
    printf( "cuMemcpyHtoD failed!\n" );
    fflush ( stdout );
    exit( 86 );
  }
//   cudaError_t cudaError = cudaGetLastError();
//   if( cudaError != cudaSuccess ) {
//     printf( "CUDA Runtime API Error reported : %s\n", cudaGetErrorString(cudaError));
//     fflush ( stdout );
//     exit( 87 );
//   } else {
//     printf( "CUDA is in good shape.\n");
//     fflush ( stdout );
//   }
  /* In-place transform: the same device buffer is input and output. */
  fftResult = cufftExecR2C( plan, ( cufftReal* )d_data, ( cufftComplex* )d_data );
  if ( CUFFT_SUCCESS != fftResult ) {
    printf( "cufftExecR2C failed with code %d\n", fftResult );
    fflush ( stdout );
    exit( 87 );
  }
  if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
    printf( "cuMemcpyDtoH failed!\n" );
    fflush ( stdout );
    exit( 88 );
  }
  if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
    printf( "cuMemFree failed!\n" );
    fflush ( stdout );
    exit( 89 );
  }
}
Esempio n. 27
0
  // Creates a CUDA memory object of `bytes` bytes on this device.
  //
  // A new memory_t<CUDA> is created with a freshly allocated CUdeviceptr as
  // its handle, the device allocation is made with cuMemAlloc, and, when
  // `source` is not NULL, `bytes` bytes are copied from it at offset 0.
  // Returns the new memory object.
  memory_v* device_t<CUDA>::malloc(const uintptr_t bytes,
                                   void *source){
    OCCA_EXTRACT_DATA(CUDA, Device);

    memory_v *mem = new memory_t<CUDA>;

    // Fill in the bookkeeping fields before touching the device.
    mem->size   = bytes;
    mem->handle = new CUdeviceptr;
    mem->dev    = dev;

    OCCA_CUDA_CHECK("Device: malloc",
                    cuMemAlloc((CUdeviceptr*) mem->handle, bytes));

    // Optional initial contents.
    if(source)
      mem->copyFrom(source, bytes, 0);

    return mem;
  }
/**
 * Allocates device memory for `length` elements of SIZE_INT bytes each.
 *
 * The allocation goes through SAFE_CUPP (defined outside this view). With
 * BEAGLE_DEBUG_FLOW defined, entry and exit are traced to stderr; with
 * BEAGLE_DEBUG_VALUES defined, the allocated address range is printed.
 *
 * @param length number of elements to allocate
 * @return the device pointer of the new allocation
 */
GPUPtr GPUInterface::AllocateIntMemory(size_t length) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::AllocateIntMemory\n");
#endif

    GPUPtr ptr;

    SAFE_CUPP(cuMemAlloc(&ptr, SIZE_INT * length));

#ifdef BEAGLE_DEBUG_VALUES
    // The allocation spans SIZE_INT * length bytes, so the end address has
    // to be scaled by the element size as well.
    fprintf(stderr, "Allocated GPU memory %llu to %llu.\n", (unsigned long long)ptr, (unsigned long long)(ptr + SIZE_INT * length));
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving  GPUInterface::AllocateIntMemory\n");
#endif

    return ptr;
}
Esempio n. 29
0
// Gives `buf` a device allocation if it does not already have one.
//
// buf_size(buf) bytes are allocated with cuMemAlloc (wrapped in TIME_CALL,
// defined outside this view); the address is stored in buf->dev and asserted
// to be non-zero.
WEAK void halide_dev_malloc(buffer_t* buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    // Unless NDEBUG is defined, log the extents, element size and current dev value.
    #ifndef NDEBUG
    fprintf(stderr, "dev_malloc of %zdx%zdx%zdx%zd (%zd bytes per element) (buf->dev = %p) buffer\n",
            buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3], buf->elem_size, (void*)buf->dev);
    #endif    

    CUdeviceptr p;
    TIME_CALL( cuMemAlloc(&p, buf_size(buf)), "dev_malloc");
    // The device address is stored as a 64-bit integer in the buffer.
    buf->dev = (uint64_t)p;
    assert(buf->dev);

    // Unless NDEBUG is defined, also validate the new device pointer.
    #ifndef NDEBUG
    assert(halide_validate_dev_pointer(buf));
    #endif
}
Esempio n. 30
0
/*
 * Allocate a new block and place it on the freelist. Will allocate
 * the bigger of the requested size and BLOCK_SIZE to avoid allocating
 * multiple small blocks (unless the context has
 * GA_CTX_DISABLE_ALLOCATION_CACHE set, in which case the requested size is
 * used as is).
 *
 * On success *res is the new block, flagged CUDA_HEAD_ALLOC and linked into
 * ctx->freeblocks in ascending pointer order, and *prev is the freelist
 * entry preceding it (NULL when it became the head).
 *
 * Returns GA_NO_ERROR on success, GA_IMPL_ERROR when cuMemAlloc fails
 * (ctx->err holds the CUDA status) and GA_MEMORY_ERROR when the bookkeeping
 * structure cannot be created.
 */
static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev,
                    size_t size) {
  CUdeviceptr ptr;
  gpudata *next;
  *prev = NULL;

  if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE))
    if (size < BLOCK_SIZE) size = BLOCK_SIZE;

  cuda_enter(ctx);

  ctx->err = cuMemAlloc(&ptr, size);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    return GA_IMPL_ERROR;
  }

  *res = new_gpudata(ctx, ptr, size);

  if (*res == NULL) {
    /* Release the device memory before leaving the context entered above;
     * freeing it after cuda_exit() is not guaranteed to succeed. */
    cuMemFree(ptr);
    cuda_exit(ctx);
    return GA_MEMORY_ERROR;
  }

  cuda_exit(ctx);

  (*res)->flags |= CUDA_HEAD_ALLOC;

  /* Now that the block is allocated, enter it in the freelist */
  next = ctx->freeblocks;
  for (; next && next->ptr < (*res)->ptr; next = next->next) {
    *prev = next;
  }
  (*res)->next = next;
  if (*prev)
    (*prev)->next = *res;
  else
    ctx->freeblocks = *res;

  return GA_NO_ERROR;
}