const bool CUDARunner::AllocateResources(const int numb, const int numt)
{
	bool allocated = true;
	CUresult rval;

	DeallocateResources();

	m_in = (cuda_in *)malloc(sizeof(cuda_in));
	m_out = (cuda_out *)malloc(numb * numt * sizeof(cuda_out));

	rval = cuMemAlloc(&m_devin, sizeof(cuda_in));
	if (rval != CUDA_SUCCESS) {
		printf("Error %d allocating CUDA memory\n", rval);
		m_devin = 0;
		allocated = false;
	}

	rval = cuMemAlloc(&m_devout, numb * numt * sizeof(cuda_out));
	if (rval != CUDA_SUCCESS) {
		printf("Error %d allocating CUDA memory\n", rval);
		m_devout = 0;
		allocated = false;
	}

	printf("Done allocating CUDA resources for (%d,%d)\n", numb, numt);
	return allocated;
}
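/*
 * Hypothetical counterpart (not part of the original listing): a minimal
 * sketch of what the matching DeallocateResources() might look like,
 * assuming the m_in/m_out host buffers and m_devin/m_devout device
 * allocations shown above are the only resources to release.
 */
void CUDARunner::DeallocateResources()
{
	if (m_in) {
		free(m_in);
		m_in = NULL;
	}
	if (m_out) {
		free(m_out);
		m_out = NULL;
	}
	if (m_devin) {
		cuMemFree(m_devin);
		m_devin = 0;
	}
	if (m_devout) {
		cuMemFree(m_devout);
		m_devout = 0;
	}
}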
void gpu_transpose_naive(int *dest, const int *src, int height, int width)
{
	assert((width & (width - 1)) == 0);   // TODO: support non-power-of-two sizes
	assert((height & (height - 1)) == 0);

	cuda->set_default_module("transpose.ptx");
	CUfunction transpose_kernel = cuda->get_kernel("transpose_naive");

	int grid_dim_x = width / BLOCK_DIM_X;
	int grid_dim_y = height / BLOCK_DIM_Y;

	CUdeviceptr device_src;
	CUdeviceptr device_dest;
	cuMemAlloc(&device_src, width * height * sizeof(int));
	cuMemAlloc(&device_dest, width * height * sizeof(int));
	cuMemcpyHtoD(device_src, src, width * height * sizeof(int));

	void *args[] = {&device_dest, &device_src, &height, &width};
	cuda->launch_kernel_2d_sync(transpose_kernel, grid_dim_x, grid_dim_y,
	                            BLOCK_DIM_X, BLOCK_DIM_Y, args);

	cuMemcpyDtoH(dest, device_dest, width * height * sizeof(int));

	cuMemFree(device_src);
	cuMemFree(device_dest);
	cuda->ctx_synchronize();
}
/*
 * get device memory
 */
void get_dev_mem(void)
{
	res = cuMemAlloc(&x_dev, N * sizeof(double));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc(x) failed: res = %s\n", conv(res));
		exit(1);
	}

	res = cuMemAlloc(&v_dev, N * sizeof(double));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc(v) failed: res = %s\n", conv(res));
		exit(1);
	}

	res = cuMemAlloc(&error_dev, sizeof(int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc(error) failed: res = %s\n", conv(res));
		exit(1);
	}

	res = cuMemAlloc(&s_time_dev, sizeof(double));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc(s_time) failed: res = %s\n", conv(res));
		exit(1);
	}
}
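/*
 * free device memory
 *
 * Hypothetical counterpart (not in the original listing): a sketch of how
 * the buffers allocated by get_dev_mem() above might be released, reusing
 * the same globals and the same conv() error-string helper.
 */
void free_dev_mem(void)
{
	res = cuMemFree(x_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree(x) failed: res = %s\n", conv(res));
		exit(1);
	}
	res = cuMemFree(v_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree(v) failed: res = %s\n", conv(res));
		exit(1);
	}
	res = cuMemFree(error_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree(error) failed: res = %s\n", conv(res));
		exit(1);
	}
	res = cuMemFree(s_time_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree(s_time) failed: res = %s\n", conv(res));
		exit(1);
	}
}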
void gpu_transpose_with_shared_mem(int *dest, const int *src, int height, int width)
{
	assert((width & (width - 1)) == 0);   // TODO: support non-power-of-two sizes
	assert((height & (height - 1)) == 0);

	cuda->set_default_module(CUDA_PTX_PREFIX"transpose.cu.ptx");
	CUfunction transpose_kernel = cuda->get_kernel("transpose_with_shared_mem");

	int grid_dim_x = width / TILE_DIM;
	int grid_dim_y = height / TILE_DIM;

	CUdeviceptr device_src;
	CUdeviceptr device_dest;
	cuMemAlloc(&device_src, width * height * sizeof(int));
	cuMemAlloc(&device_dest, width * height * sizeof(int));
	cuMemcpyHtoD(device_src, src, width * height * sizeof(int));

	void *args[] = {&device_dest, &device_src};
	cuda->launch_kernel_2d_sync(transpose_kernel, grid_dim_x, grid_dim_y,
	                            TILE_DIM, 2, args);

	cuMemcpyDtoH(dest, device_dest, width * height * sizeof(int));

	cuMemFree(device_src);
	cuMemFree(device_dest);
	cuda->ctx_synchronize();
}
Object cuda_over_map(Object self, int nparts, int *argcv, Object *argv, int flags)
{
	CUresult error;
	cuInit(0);
	int deviceCount = 0;
	error = cuDeviceGetCount(&deviceCount);
	if (deviceCount == 0) {
		raiseError("No CUDA devices found");
	}

	CUdevice cuDevice;
	CUcontext cuContext;
	CUmodule cuModule;
	CUfunction cuFunc;
	error = cuDeviceGet(&cuDevice, 0);
	error = cuCtxCreate(&cuContext, 0, cuDevice);

	CUdeviceptr d_A;
	CUdeviceptr d_B;
	CUdeviceptr d_res;
	errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));

	CUdeviceptr dps[argcv[0]];
	void *args[argcv[0] + 2];
	int size = INT_MAX;
	for (int i = 0; i < argcv[0]; i++) {
		struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
		if (a->size < size)
			size = a->size;
		errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
		errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
		args[i + 1] = &dps[i];
	}

	struct CudaFloatArray *r =
		(struct CudaFloatArray *)(alloc_CudaFloatArray(size));
	int fsize = sizeof(float) * size;
	errcheck(cuMemAlloc(&d_res, fsize));
	errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
	args[0] = &d_res;
	args[argcv[0] + 1] = &size;

	int threadsPerBlock = 256;
	int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

	/* Derive the kernel name: "block" + module basename without its extension. */
	char name[256];
	strcpy(name, "block");
	strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
	for (int i = 0; name[i] != 0; i++)
		if (name[i] == '.') {
			name[i] = 0;
			break;
		}

	errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
	errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
	                        threadsPerBlock, 1, 1,
	                        0, NULL, args, NULL));
	errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));

	cuMemFree(d_res);
	for (int i = 0; i < argcv[0]; i++)
		cuMemFree(dps[i]);
	return (Object)r;
}
/**
 * This measures the overhead in launching a kernel function on each GPU in
 * the system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory)
 * a very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
	CU_ERROR_CHECK(cuInit(0));

	int count;
	CU_ERROR_CHECK(cuDeviceGetCount(&count));

	float x = 5.0f;
	for (int d = 0; d < count; d++) {
		CUdevice device;
		CU_ERROR_CHECK(cuDeviceGet(&device, d));

		CUcontext context;
		CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

		CUdeviceptr in, out;
		CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
		CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
		CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

		CUmodule module;
		CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

		CUfunction function;
		CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

		void *params[] = { &in, &out };

		CUevent start, stop;
		CU_ERROR_CHECK(cuEventCreate(&start, 0));
		CU_ERROR_CHECK(cuEventCreate(&stop, 0));

		CU_ERROR_CHECK(cuEventRecord(start, 0));
		for (int i = 0; i < ITERATIONS; i++)
			CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));
		CU_ERROR_CHECK(cuEventRecord(stop, 0));
		CU_ERROR_CHECK(cuEventSynchronize(stop));

		float time;
		CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

		CU_ERROR_CHECK(cuEventDestroy(start));
		CU_ERROR_CHECK(cuEventDestroy(stop));
		CU_ERROR_CHECK(cuMemFree(in));
		CU_ERROR_CHECK(cuMemFree(out));

		fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

		CU_ERROR_CHECK(cuModuleUnload(module));
		CU_ERROR_CHECK(cuCtxDestroy(context));
	}

	return 0;
}
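/*
 * Hypothetical example (not part of the original listing): the PTX held in
 * imageBytes is not shown, but the description above ("copying 1 value in
 * global memory") suggests a kernel along these lines, which could be
 * compiled to PTX with `nvcc --ptx` and embedded as the module image.
 */
extern "C" __global__ void kernel(const float *in, float *out) {
	*out = *in;   // copy a single value through global memory
}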
CUresult TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
	CUresult status;
	CUdeviceptr dptrOut = 0;
	CUdeviceptr dptrIn = 0;
	float *hostOut = 0;
	float *hostIn = 0;

	CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

	CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
	CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
	CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
	CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
	CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );

	for ( size_t i = 0; i < N; i++ ) {
		hostIn[i] = (float) rand() / (float) RAND_MAX;
	}
	CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float), NULL ) );

	{
		CUmodule moduleSAXPY;
		CUfunction kernelSAXPY;
		void *params[] = { &dptrOut, &dptrIn, &N, &alpha };

		moduleSAXPY = chDevice->module( "saxpy.ptx" );
		if ( ! moduleSAXPY ) {
			status = CUDA_ERROR_NOT_FOUND;
			goto Error;
		}
		CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );
		CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );
	}

	CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
	CUDA_CHECK( cuCtxSynchronize() );

	for ( size_t i = 0; i < N; i++ ) {
		if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
			status = CUDA_ERROR_UNKNOWN;
			goto Error;
		}
	}
	status = CUDA_SUCCESS;
	printf( "Well it worked!\n" );

Error:
	cuCtxPopCurrent( NULL );
	cuMemFreeHost( hostOut );
	cuMemFreeHost( hostIn );
	cuMemFree( dptrOut );
	cuMemFree( dptrIn );
	return status;
}
void *swanMalloc( size_t len )
{
	void *ptr;
	CUdeviceptr dptr;
	CUresult err;

	try_init();

	if ( len == 0 ) {
		// printf("SWAN: WARNING - swanMalloc() called with 0\n");
		return NULL;
	}

	err = cuMemAlloc( (CUdeviceptr*) &dptr, len );
	ptr = (void*) dptr;
	if ( err != CUDA_SUCCESS ) {
		printf("Attempted to allocate %lu bytes (%lu already allocated)\n",
		       len, state.bytes_allocated );
		abort();
		error("swanMalloc failed\n" );
	}
	state.bytes_allocated += len;

	// MJH likes his memory clean
	swanMemset( ptr, 0, len );
	return ptr;
}
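/*
 * Hypothetical counterpart (not in the original listing): a sketch of how a
 * matching swanFree() might release a pointer obtained from swanMalloc().
 * The real library presumably also adjusts state.bytes_allocated; that
 * bookkeeping is omitted here because this sketch does not track
 * per-allocation sizes.
 */
void swanFree( void *ptr )
{
	CUresult err;

	if ( ptr == NULL ) {
		return;
	}
	err = cuMemFree( (CUdeviceptr) ptr );
	if ( err != CUDA_SUCCESS ) {
		error("swanFree failed\n" );
	}
}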
WEAK void halide_dev_malloc(void *user_context, buffer_t* buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    size_t size = __buf_size(user_context, buf);

    #ifdef DEBUG
    halide_printf(user_context,
                  "dev_malloc allocating buffer of %zd bytes, "
                  "extents: %zdx%zdx%zdx%zd strides: %zdx%zdx%zdx%zd (%d bytes per element)\n",
                  size,
                  buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3],
                  buf->stride[0], buf->stride[1], buf->stride[2], buf->stride[3],
                  buf->elem_size);
    #endif

    CUdeviceptr p;
    TIME_CALL( cuMemAlloc(&p, size), "dev_malloc");
    buf->dev = (uint64_t)p;
    halide_assert(user_context, buf->dev);

    #ifdef DEBUG
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    #endif
}
CUdeviceptr get_read_ptr_cuda(ComputeEnv *env, int devid, size_t read_byte_size)
{
	if (cuda_valid_list[devid]) {
		return cuda_ptr_list[devid];
	}

	if (host_valid == false) {
		/* no valid copy of the data exists anywhere */
		abort();
		return 0;
	}

	CUDADev *dev = &env->cuda_dev_list[devid];
	cuCtxPushCurrent(dev->context);

	if (cuda_ptr_list[devid] == 0) {
		CUresult err;
		err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
		if (err != CUDA_SUCCESS) {
			abort();
		}
	}

	//double t0 = getsec();
	cuMemcpyHtoD(cuda_ptr_list[devid], host_ptr, read_byte_size);
	//double t1 = getsec();
	//env->transfer_wait = t1-t0;

	cuda_valid_list[devid] = true;

	CUcontext old;
	cuCtxPopCurrent(&old);

	return cuda_ptr_list[devid];
}
int main() {
	init_test();

	const std::string source =
		".version 4.2\n"
		".target sm_20\n"
		".address_size 64\n"
		".visible .entry kernel(.param .u64 kernel_param_0) {\n"
		".reg .s32 %r<2>;\n"
		".reg .s64 %rd<3>;\n"
		// The unconditional branch below jumps straight to the exit label,
		// so the store of 5 is never executed.
		"bra BB1_2;\n"
		"ld.param.u64 %rd1, [kernel_param_0];\n"
		"cvta.to.global.u64 %rd2, %rd1;\n"
		"mov.u32 %r1, 5;\n"
		"st.global.u32 [%rd2], %r1;\n"
		"BB1_2: ret;\n"
		"}\n";

	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel"));

	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue)));

	void *params[] = {&devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1, 1, 1, 1, 1, 1, 0, 0, params, nullptr));

	cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue)));
	// The kernel skipped its store, so the value read back is unchanged.
	assert(hostValue == 10);
	std::cout << hostValue << "\n";

	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
void sararfftnd_one_complex_to_real( sararfftnd_plan plan, sarafft_complex *h_data ) {
	CUdeviceptr d_data;
	size_t planSize = getPlanSize( plan );

	if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
		printf( "cuMemAlloc failed for plansize %li!\n", planSize );
		fflush ( stdout );
		exit( 90 );
	}
	if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
		printf( "cuMemcpyHtoD failed!\n" );
		fflush ( stdout );
		exit( 91 );
	}
	if ( CUFFT_SUCCESS != cufftExecC2R( plan, ( cufftComplex* )d_data, ( cufftReal* )d_data ) ) {
		printf( "cufftExecC2R failed!\n" );
		fflush ( stdout );
		exit( 92 );
	}
	if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
		printf( "cuMemcpyDtoH failed!\n" );
		fflush ( stdout );
		exit( 93 );
	}
	if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
		printf( "cuMemFree failed!\n" );
		fflush ( stdout );
		exit( 94 );
	}
}
bool prealloc(ComputeEnv *env)
{
	int devid;

	if (host_ptr == nullptr) {
		host_ptr = _mm_malloc(byte_size, 64);
		if (host_ptr == nullptr) {
			return false;
		}
	}

	switch (env->target_processor.type) {
	case W2XCONV_PROC_HOST:
		break;

	case W2XCONV_PROC_OPENCL:
		devid = env->target_processor.devid;
		if (cl_ptr_list[devid] == nullptr) {
			cl_int err;
			OpenCLDev *dev = &env->cl_dev_list[devid];
			cl_ptr_list[devid] = clCreateBuffer(dev->context,
			                                    CL_MEM_READ_WRITE,
			                                    byte_size, nullptr, &err);
			if (cl_ptr_list[devid] == nullptr) {
				return false;
			}

			/* touch memory to force allocation */
			char data = 0;
			err = clEnqueueWriteBuffer(dev->queue, cl_ptr_list[devid],
			                           CL_TRUE, 0, 1, &data,
			                           0, nullptr, nullptr);
			if (err != CL_SUCCESS) {
				clReleaseMemObject(cl_ptr_list[devid]);
				cl_ptr_list[devid] = nullptr;
				return false;
			}
		}
		break;

	case W2XCONV_PROC_CUDA:
		devid = env->target_processor.devid;
		if (cuda_ptr_list[devid] == 0) {
			CUresult err;
			CUDADev *dev = &env->cuda_dev_list[devid];
			cuCtxPushCurrent(dev->context);
			err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
			CUcontext old;
			cuCtxPopCurrent(&old);
			if (err != CUDA_SUCCESS) {
				return false;
			}
		}
		break;
	}

	return true;
}
void mem_alloc(device_memory& mem, MemoryType type)
{
	cuda_push_context();
	CUdeviceptr device_pointer;
	cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
	mem.device_pointer = (device_ptr)device_pointer;
	cuda_pop_context();
}
CUresult CuContext::ByteAlloc(size_t size, DeviceMemPtr* ppMem) {
	DeviceMemPtr mem(new CuDeviceMem);
	CUresult result = cuMemAlloc(&mem->_deviceptr, size);
	HANDLE_RESULT();

	mem->_size = size;
	mem->_context = this;
	ppMem->swap(mem);
	return CUDA_SUCCESS;
}
/// Allocates memory buffer on the device associated with the given queue.
device_vector(const command_queue &q, size_t n) : n(n) {
	if (n) {
		q.context().set_current();

		CUdeviceptr ptr;
		cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

		buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)),
		             detail::deleter());
	}
}
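// Hypothetical sketch (not in the original listing) of what the
// detail::deleter used above might look like: it undoes the pointer cast
// from the constructor and releases the device allocation. The library's
// actual deleter may also need to make the owning context current first.
struct deleter {
	void operator()(char *p) const {
		if (p) {
			cuda_check( cuMemFree(static_cast<CUdeviceptr>(
			        reinterpret_cast<size_t>(p))) );
		}
	}
};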
static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags, int *ret)
{
	gpudata *res;
	cuda_context *ctx = (cuda_context *)c;
	int fl = CU_EVENT_DISABLE_TIMING;

	if ((flags & GA_BUFFER_INIT) && data == NULL)
		FAIL(NULL, GA_VALUE_ERROR);

	if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) ==
	    (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY))
		FAIL(NULL, GA_VALUE_ERROR);

	/* TODO: figure out how to make this work */
	if (flags & GA_BUFFER_HOST)
		FAIL(NULL, GA_DEVSUP_ERROR);

	res = malloc(sizeof(*res));
	if (res == NULL)
		FAIL(NULL, GA_SYS_ERROR);

	res->refcnt = 1;
	res->sz = size;
	res->flags = flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY);

	cuda_enter(ctx);
	if (ctx->err != CUDA_SUCCESS) {
		free(res);
		FAIL(NULL, GA_IMPL_ERROR);
	}

	if (ctx->flags & GA_CTX_MULTI_THREAD)
		fl |= CU_EVENT_BLOCKING_SYNC;

	ctx->err = cuEventCreate(&res->ev, fl);
	if (ctx->err != CUDA_SUCCESS) {
		free(res);
		cuda_exit(ctx);
		FAIL(NULL, GA_IMPL_ERROR);
	}

	if (size == 0) size = 1;

	ctx->err = cuMemAlloc(&res->ptr, size);
	if (ctx->err != CUDA_SUCCESS) {
		cuEventDestroy(res->ev);
		free(res);
		cuda_exit(ctx);
		FAIL(NULL, GA_IMPL_ERROR);
	}

	res->ctx = ctx;
	ctx->refcnt++;

	if (flags & GA_BUFFER_INIT) {
		ctx->err = cuMemcpyHtoD(res->ptr, data, size);
		if (ctx->err != CUDA_SUCCESS) {
			cuda_free(res);
			FAIL(NULL, GA_IMPL_ERROR)
		}
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n",
                  user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context,
                 "    allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUdeviceptr p;
    DEBUG_PRINTF( user_context, "    cuMemAlloc %lld -> ", size );
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err));
        halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", p );
    }
    halide_assert(user_context, p);
    buf->dev = (uint64_t)p;

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}
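/* Hypothetical counterpart (not in the original listing): a minimal sketch
   of how a matching free routine might release a pointer returned by
   nvptx_alloc, reusing the same cuda_error helper for diagnostics.  The
   plugin's real free path may do additional bookkeeping.  */
static void
nvptx_free (void *p)
{
  CUresult r;

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}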
void Buffer::allocate()
{
	if (hostAccess() == BufferAccess::Unified)
	{
		// Allocate the required memory
		VCL_CU_SAFE_CALL(cuMemAllocManaged(&_devicePtr, size(), CU_MEM_ATTACH_GLOBAL));
	}
	else
	{
		// Allocate the required device memory
		VCL_CU_SAFE_CALL(cuMemAlloc(&_devicePtr, size()));
	}
}
SEXP R_auto_cuMemAlloc(SEXP r_bytesize)
{
	SEXP r_ans = R_NilValue;
	CUdeviceptr dptr;
	size_t bytesize = REAL(r_bytesize)[0];
	CUresult ans;

	ans = cuMemAlloc(&dptr, bytesize);
	if (ans)
		return(R_cudaErrorInfo(ans));

	r_ans = R_createRef((void *) dptr, "CUdeviceptr");
	return(r_ans);
}
void setupSizeResource()
{
	deleteImage(img);
	free(img_content);
	checkCudaErrors(cuMemFree(d_img_content));

	item_size = width * height * 4;
	img = createImage(width, height);
	img_content = (unsigned char*)malloc(item_size);
	checkCudaErrors(cuMemAlloc(&d_img_content, item_size));
	checkCudaErrors(cuMemcpyHtoD(d_fragColor, &d_img_content, d_fragColor_bytes));
}
int main(int argc, char *argv[])
{
	char c;
	CUcontext ctx;
	CUdevice dev = 0;
	void *toSpace;
	int status;
	size_t free, total;
	CUdeviceptr ptr = (CUdeviceptr)NULL;
	int size;

	if (argc != 2) {
		fprintf(stderr, "Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n");
		exit(1);
	}

	printf("All status results should be 0, if not an error has occurred.\n"
	       "If 2 is reported an out of memory error has occurred for\n"
	       "which you should decrease the memory input\n");

	size = atoi(argv[1]);
	printf("\nTrying to allocate %iMB of memory on host and GPU\n", size);
	if (size <= 0) {
		fprintf(stderr, "\nERROR: Memory must be greater than 0\n");
		exit(1);
	}

	status = cuInit(0);
	printf("Init status: %i\n", status);

	status = cuCtxCreate(&ctx, 0, dev);
	printf("Context creation status: %i\n", status);

	status = cuMemGetInfo(&free, &total);
	printf("Get memory info status: %i\n", status);
	printf("\n%.1f/%.1f (Free/Total) MB\n", free / 1024.0 / 1024.0, total / 1024.0 / 1024.0);

	status = cuMemHostAlloc(&toSpace, (size_t)size * 1024 * 1024, 0);
	printf("Host allocation status: %i %s\n", status,
	       (status == CUDA_SUCCESS) ? "SUCCESS" : "FAILED");

	status = cuMemAlloc(&ptr, (size_t)size * 1024 * 1024);
	printf("GPU allocation status: %i %s\n", status,
	       (status == CUDA_SUCCESS) ? "SUCCESS" : "FAILED");

	printf("\nPress any key to exit...");
	scanf("%c", &c);

	status = cuCtxDestroy(ctx);
	printf("Context destroy status: %i\n", status);

	return 0;
}
static int init_thread(CUcontext *pctx, CUdevice dev, size_t s)
{
	CUdeviceptr ptr;
	CUresult cerr;

	if( (cerr = cuCtxCreate(pctx, 0, dev)) ){
		fprintf(stderr, " Error (%d) creating CUDA context\n", cerr);
		return -1;
	}
	if(s){
		if( (cerr = cuMemAlloc(&ptr, s)) ){
			fprintf(stderr, " Error (%d) allocating %zub\n", cerr, s);
			return -1;
		}
	}
	return 0;
}
int main() {
	init_test();

	const std::string test_source =
		".version 4.2\n"
		".target sm_20\n"
		".address_size 64\n"
		".visible .entry _Z6kernelPfi(\n"
		".param .u64 _Z6kernelPfi_param_0,\n"
		".param .u32 _Z6kernelPfi_param_1){\n"
		".reg .pred %p<2>;\n"
		".reg .f32 %f<3>;\n"
		".reg .s32 %r<3>;\n"
		".reg .s64 %rd<5>;\n"
		"ld.param.u64 %rd1, [_Z6kernelPfi_param_0];\n"
		"ld.param.u32 %r2, [_Z6kernelPfi_param_1];\n"
		"mov.u32 %r1, %tid.x;\n"
		"setp.ge.u32 %p1, %r1, %r2;\n"
		"@%p1 bra BB0_2;\n"
		"cvta.to.global.u64 %rd2, %rd1;\n"
		"cvt.rn.f32.u32 %f1, %r1;\n"
		"mul.f32 %f2, %f1, 0f3FC00000;\n"
		"mul.wide.u32 %rd3, %r1, 4;\n"
		"add.s64 %rd4, %rd2, %rd3;\n"
		"st.global.f32 [%rd4], %f2;\n"
		"BB0_2:\n"
		"ret;\n"
		"}";

	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "_Z6kernelPfi"));

	CUdeviceptr devArray;
	int size = 10;
	float hostArray[size];
	memset(hostArray, 0, size * sizeof(hostArray[0]));

	cu_assert(cuMemAlloc(&devArray, sizeof(float) * size));

	void *params[] = {&devArray, &size};
	// Launch twice as many threads as elements; the guard in the PTX makes
	// threads with tid >= size skip the store.
	auto result = cuLaunchKernel(funcHandle, 1, 1, 1, size * 2, 1, 1, 0, 0, params, nullptr);
	cu_assert(result);

	cu_assert(cuMemcpyDtoH(&hostArray, devArray, sizeof(hostArray[0]) * size));
	cu_assert(cuMemFree(devArray));
	cu_assert(cuModuleUnload(modId));

	for (int i = 0; i < size; ++i)
		std::cout << hostArray[i] << '\n';

	return 0;
}
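// Hypothetical reconstruction (not part of the original listing): CUDA C
// source that plausibly corresponds to the PTX above. Each thread with
// tid < size stores tid * 1.5f (0f3FC00000 is the hex encoding of 1.5f),
// and _Z6kernelPfi is the Itanium mangling of kernel(float*, int).
__global__ void kernel(float *out, int size) {
	unsigned int tid = threadIdx.x;
	if (tid < (unsigned int)size)
		out[tid] = tid * 1.5f;
}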
void sararfftnd_one_real_to_complex( sararfftnd_plan plan, sarafft_real *h_data ) {
	CUdeviceptr d_data;
	size_t planSize = getPlanSize( plan );
	// printf( "planSize = %li!\n", planSize );
	// fflush ( stdout );
	cufftResult fftResult;
	CUresult cudaResult;

	if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
		printf( "cuMemAlloc failed for plansize %li!\n", planSize );
		fflush ( stdout );
		exit( 85 );
	}
	if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
		printf( "cuMemcpyHtoD failed!\n" );
		fflush ( stdout );
		exit( 86 );
	}

	// cudaError_t cudaError = cudaGetLastError();
	// if( cudaError != cudaSuccess ) {
	//     printf( "CUDA Runtime API Error reported : %s\n", cudaGetErrorString(cudaError));
	//     fflush ( stdout );
	//     exit( 87 );
	// } else {
	//     printf( "CUDA is in good shape.\n");
	//     fflush ( stdout );
	// }

	fftResult = cufftExecR2C( plan, ( cufftReal* )d_data, ( cufftComplex* )d_data );
	if ( CUFFT_SUCCESS != fftResult ) {
		printf( "cufftExecR2C failed with code %d\n", fftResult );
		fflush ( stdout );
		exit( 87 );
	}
	if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
		printf( "cuMemcpyDtoH failed!\n" );
		fflush ( stdout );
		exit( 88 );
	}
	if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
		printf( "cuMemFree failed!\n" );
		fflush ( stdout );
		exit( 89 );
	}
}
memory_v* device_t<CUDA>::malloc(const uintptr_t bytes, void *source){
	OCCA_EXTRACT_DATA(CUDA, Device);

	memory_v *mem = new memory_t<CUDA>;

	mem->dev    = dev;
	mem->handle = new CUdeviceptr;
	mem->size   = bytes;

	OCCA_CUDA_CHECK("Device: malloc",
	                cuMemAlloc((CUdeviceptr*) mem->handle, bytes));

	if(source != NULL)
		mem->copyFrom(source, bytes, 0);

	return mem;
}
GPUPtr GPUInterface::AllocateIntMemory(size_t length) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::AllocateIntMemory\n");
#endif

    GPUPtr ptr;
    SAFE_CUPP(cuMemAlloc(&ptr, SIZE_INT * length));

#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, "Allocated GPU memory %llu to %llu.\n",
            (unsigned long long)ptr, (unsigned long long)(ptr + length));
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving GPUInterface::AllocateIntMemory\n");
#endif

    return ptr;
}
WEAK void halide_dev_malloc(buffer_t* buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    #ifndef NDEBUG
    fprintf(stderr, "dev_malloc of %zdx%zdx%zdx%zd (%zd bytes per element) (buf->dev = %p) buffer\n",
            buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3],
            buf->elem_size, (void*)buf->dev);
    #endif

    CUdeviceptr p;
    TIME_CALL( cuMemAlloc(&p, buf_size(buf)), "dev_malloc");
    buf->dev = (uint64_t)p;
    assert(buf->dev);

    #ifndef NDEBUG
    assert(halide_validate_dev_pointer(buf));
    #endif
}
/*
 * Allocate a new block and place it on the freelist. Will allocate
 * the bigger of the requested size and BLOCK_SIZE to avoid allocating
 * multiple small blocks.
 */
static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, size_t size)
{
	CUdeviceptr ptr;
	gpudata *next;

	*prev = NULL;

	if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE))
		if (size < BLOCK_SIZE)
			size = BLOCK_SIZE;

	cuda_enter(ctx);

	ctx->err = cuMemAlloc(&ptr, size);
	if (ctx->err != CUDA_SUCCESS) {
		cuda_exit(ctx);
		return GA_IMPL_ERROR;
	}

	*res = new_gpudata(ctx, ptr, size);

	cuda_exit(ctx);

	if (*res == NULL) {
		cuMemFree(ptr);
		return GA_MEMORY_ERROR;
	}

	(*res)->flags |= CUDA_HEAD_ALLOC;

	/* Now that the block is allocated, enter it in the freelist */
	next = ctx->freeblocks;
	for (; next && next->ptr < (*res)->ptr; next = next->next) {
		*prev = next;
	}
	(*res)->next = next;
	if (*prev)
		(*prev)->next = *res;
	else
		ctx->freeblocks = *res;

	return GA_NO_ERROR;
}