extern "C" void sarafft_init() { printf( "Cuda is about to be initialized!\n" ); fflush ( stdout ); char *OMPI_COMM_WORLD_LOCAL_RANK = getenv( "OMPI_COMM_WORLD_LOCAL_RANK" ); if ( NULL == OMPI_COMM_WORLD_LOCAL_RANK ) { printf( "OMPI_COMM_WORLD_LOCAL_RANK not set!\n" ); fflush ( stdout ); exit( 80 ); } int localRank = atoi( OMPI_COMM_WORLD_LOCAL_RANK ); printf( "Local rank is %d\n", localRank ); fflush ( stdout ); if ( CUDA_SUCCESS != cuInit( 0 ) ) { printf( "cuInit failed!\n" ); fflush ( stdout ); exit( 81 ); } CUdevice device; if ( CUDA_SUCCESS != cuDeviceGet( &device, localRank ) ) { printf( "cuDeviceGet failed!\n" ); fflush ( stdout ); exit( 82 ); } if ( CUDA_SUCCESS != cuCtxCreate( &cuda_context, CU_CTX_SCHED_YIELD, device ) ) { printf( "cuCtxCreate failed!\n" ); fflush ( stdout ); exit( 83 ); } printf( "Cuda was initialized successfully!\n" ); fflush ( stdout ); }
/* Set up the CUDA device selected by `info.num` and create its context.
 *
 * When `background_` is false a GL-interop context is attempted first; if
 * that fails a plain context is created instead and `background` is set to
 * true. The constructor returns early (leaving the remaining members at
 * their reset values) as soon as one of the error helpers reports a failure.
 * NOTE(review): cuda_error()/cuda_error_() presumably record the error for
 * later retrieval -- confirm against their definitions. */
CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(stats)
{
	background = background_;
	cuDevId = info.num;
	cuDevice = 0;
	cuContext = 0;

	/* initialize the driver API */
	if(cuda_error(cuInit(0)))
		return;

	/* resolve the device handle */
	if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
		return;

	/* create the context, preferring GL interop for foreground use */
	CUresult result;

	if(!background) {
		result = cuGLCtxCreate(&cuContext, 0, cuDevice);

		if(result != CUDA_SUCCESS) {
			/* GL interop unavailable: fall back to a plain context */
			result = cuCtxCreate(&cuContext, 0, cuDevice);
			background = true;
		}
	}
	else {
		result = cuCtxCreate(&cuContext, 0, cuDevice);
	}

	if(cuda_error_(result, "cuCtxCreate"))
		return;

	cuda_pop_context();
}
/*
 * Create a CUDA context on `dev` and wrap it with cuda_make_ctx().
 *
 * dev   - device handle to create the context on
 * flags - GA_CTX_* flags; they select the scheduling mode and are OR-ed into
 *         the flags of the returned object
 * ret   - not referenced directly here; presumably written by the
 *         CHKFAIL/FAIL macros on error -- TODO confirm
 *
 * Returns the wrapped context. Error exits go through CHKFAIL/FAIL with NULL
 * as their first argument (presumably the value returned -- TODO confirm).
 */
static void *do_init(CUdevice dev, int flags, int *ret) {
    cuda_context *res;
    CUcontext ctx;
    unsigned int fl = CU_CTX_SCHED_AUTO;
    int i;

    /* NOTE(review): `err` is not assigned in this function before this check;
     * it presumably tests a status left behind by the caller -- confirm. */
    CHKFAIL(NULL);

    /* Map the requested threading model onto a scheduling flag. When both
     * flags are set the multi-thread choice wins because it is applied last. */
    if (flags & GA_CTX_SINGLE_THREAD)
        fl = CU_CTX_SCHED_SPIN;
    if (flags & GA_CTX_MULTI_THREAD)
        fl = CU_CTX_SCHED_YIELD;

    /* Devices without unified addressing are rejected. */
    err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    CHKFAIL(NULL);
    if (i != 1)
        FAIL(NULL, GA_UNSUPPORTED_ERROR);

    err = cuCtxCreate(&ctx, fl, dev);
    CHKFAIL(NULL);

    res = cuda_make_ctx(ctx, 0);
    if (res == NULL) {
        /* Wrapping failed: do not leak the freshly created context. */
        cuCtxDestroy(ctx);
        FAIL(NULL, GA_IMPL_ERROR);
    }
    res->flags |= flags;

    /* Don't leave the context on the thread stack */
    cuCtxPopCurrent(NULL);
    return res;
}
/**
 * Creates a CUDA context on device 0 and stores it in `cuContext`.
 *
 * @param cuContext  receives the new context (created with
 *                   CU_CTX_MAP_HOST | CU_CTX_BLOCKING_SYNC)
 * @return true when the context was created; false when no device is
 *         available or a driver call failed.
 *
 * The original fell off the end of a non-void function, so the value seen by
 * callers on the success path was undefined.
 */
bool initCuda(CUcontext & cuContext)
{
    // Count the devices. deviceCount stays 0 if the query itself fails, which
    // is then reported as "no device".
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount <= 0) {
        fprintf(stderr, "Sorry, no CUDA device found");
        return false;
    }

    const int selectedDevice = 0;
    if (selectedDevice >= deviceCount) {
        fprintf(stderr, "Choose device ID between 0 and %d\n", deviceCount-1);
        return false;
    }

    // Initialize the CUDA device
    CUdevice cuDevice;
    CUresult cerr = cuDeviceGet(&cuDevice, selectedDevice);
    checkError(cerr);
    if (cerr != CUDA_SUCCESS) {
        // Only reached if checkError() returns on failure.
        return false;
    }

    cerr = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST|CU_CTX_BLOCKING_SYNC, cuDevice);
    checkError(cerr);
    return cerr == CUDA_SUCCESS;
}
/*
 * Initialises the CUDA driver API and creates a context on device 0.
 *
 * device_info - receives the device handle (->dev) and the context (->context)
 *
 * Returns 0 on success, -1 after printing the failing call and its CUresult
 * code to stdout.
 */
int mcopy_gpu_init(struct device_info *device_info)
{
	CUresult res;

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	return 0;
}
int main(int argc, char* argv[]) { cuInit(0); int devs = 0; cuDeviceGetCount(&devs); assert(devs > 0); CUdevice dev; CUresult status; CUcontext ctx = 0; cuDeviceGet(&dev, 0); cuCtxCreate(&ctx, 0, dev); { size_t f = 0, t = 0; CUresult r = cuMemGetInfo(&f, &t); fprintf( stderr, "Do cuMemGetInfo: %d, %zu/%zu\n", r, f, t ); } __init("\n"); printf("\nPress any key to exit..."); char c; scanf("%c", &c); return 0; }
/*
 * Creates the CUDA context (CU_CTX_MAP_HOST) on the device chosen by
 * getBestDevice() and computes the size of the "to space".
 *
 * The result is the free device memory reported by cuMemGetInfo minus:
 *   - two arrays of num_blocks jlong values,
 *   - gc_space_size (set to 1024 here),
 *   - classMemSize (set to room for 100 jint values here),
 * where num_blocks = numMultiProcessors * max_threads_per_block *
 * max_blocks_per_proc.
 *
 * Error handling is delegated to CHECK_STATUS_RTN, invoked with 0 as its last
 * argument (presumably the value returned on failure -- TODO confirm).
 */
size_t initContext(JNIEnv * env, jint max_blocks_per_proc, jint max_threads_per_block)
{
  int status;
  int deviceCount = 0;
  size_t f_mem;
  size_t t_mem;
  jint num_blocks;
  size_t to_space_size;

  status = cuDeviceGetCount(&deviceCount);
  CHECK_STATUS_RTN(env,"error in cuDeviceGetCount",status, 0);

  getBestDevice(env);

  status = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, cuDevice);
  CHECK_STATUS_RTN(env,"error in cuCtxCreate",status, 0)

  status = cuMemGetInfo (&f_mem, &t_mem);
  CHECK_STATUS_RTN(env,"error in cuMemGetInfo",status, 0)

  /* space for 100 types in the scene */
  classMemSize = sizeof(jint)*100;

  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;

  gc_space_size = 1024;

  /* Start from all free memory and carve out the reserved regions. */
  to_space_size = f_mem;
  to_space_size = to_space_size - (num_blocks * sizeof(jlong));
  to_space_size = to_space_size - (num_blocks * sizeof(jlong));
  to_space_size = to_space_size - gc_space_size;
  to_space_size = to_space_size - classMemSize;

  return to_space_size;
}
int main() { int ngpu; CUdevice cuDevice; CUcontext cuContext; cuInit(0); cuDeviceGetCount(&ngpu); //printf("ngpu = %d\n", ngpu); size_t *totals, *frees ; totals = (size_t *) calloc (ngpu, sizeof(size_t)); frees = (size_t *) calloc (ngpu, sizeof(size_t)); int tid; omp_set_num_threads(ngpu); #pragma omp parallel private(tid, cuDevice, cuContext) shared(frees, totals) { tid = omp_get_thread_num(); //printf("nthreads = %d, tid = %d\n", omp_get_num_threads(), tid); cuDeviceGet(&cuDevice, tid); cuCtxCreate(&cuContext, tid, cuDevice); cuMemGetInfo((size_t*)&frees[tid], (size_t*)&totals[tid]); } printf ("\ttotal\t\tfree\t\tused\n"); for(int i=0; i<ngpu; i++) { printf("GPU %d\t%lu\t%lu\t%lu\n", i, (size_t)totals[i], (size_t)frees[i], (size_t)totals[i]-(size_t)frees[i]); } return 0; }
/*
 * Brings up the driver API on device 0: initialises CUDA, creates a context
 * and loads the module file `f`.
 *
 * pctx - receives the created context
 * pmod - receives the loaded module
 * f    - path of the module file handed to cuModuleLoad
 *
 * Returns CUDA_SUCCESS, or the CUresult of the first failing call after
 * printing a message to stdout. If the module cannot be loaded the context
 * created here is destroyed again before returning.
 */
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
	CUdevice dev;
	CUresult res;

	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	if ((res = cuDeviceGet(&dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	if ((res = cuCtxCreate(pctx, 0, dev)) != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	if ((res = cuModuleLoad(pmod, f)) != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		/* undo the context creation so nothing is left behind */
		cuCtxDestroy(*pctx);
		return res;
	}

	/* res is CUDA_SUCCESS here */
	return res;
}
bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane) { Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12"); TexRes &r = res[plane]; if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes) return true; if (!ctx) { // TODO: how to use pop/push decoder's context without the context in opengl context CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false); if (USE_STREAM) { CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT)); CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT)); } qDebug("cuda contex on gl thread: %p", ctx); CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this } if (r.cuRes) { CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false); r.cuRes = NULL; } // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false); r.texture = tex; r.w = w; r.h = h; r.H = H; return true; }
/*
 * Creates the CUDA context stored in `cuContext` (only when _ENABLE_CUDA_ is
 * defined; otherwise this is a no-op that returns 0).
 *
 * The device is 0 unless the LOCAL_RANK environment variable is set, in which
 * case it is LOCAL_RANK modulo the number of devices.
 *
 * Returns 0 on success and 1 on failure. A failed or empty device query now
 * returns 1; previously dev_count was used uninitialised / as a zero divisor.
 */
int init_cuda_context (void)
{
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
    int local_rank;
    int dev_count = 0;
    int dev_id = 0;
    char * str;

    if ((str = getenv("LOCAL_RANK")) != NULL) {
        /* dev_count stays 0 if the query fails */
        cudaGetDeviceCount(&dev_count);
        if (dev_count <= 0) {
            return 1;
        }

        local_rank = atoi(str);
        dev_id = local_rank % dev_count;
        if (dev_id < 0) {
            /* keep the ordinal valid for a negative rank */
            dev_id += dev_count;
        }
    }

    curesult = cuInit(0);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuDeviceGet(&cuDevice, dev_id);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuCtxCreate(&cuContext, 0, cuDevice);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }
#endif
    return 0;
}
/*
 * Runs a CUDA kernel over a set of float arrays and returns the result array.
 *
 * argv[0 .. argcv[0]-1] are CudaFloatArray inputs; argv[argcv[0]] is a string
 * naming the module file. The kernel name is "block" followed by the module
 * path with its first strlen("_cuda/") characters and everything from the
 * first '.' removed. The kernel receives (result, inputs..., size) where size
 * is the smallest input length.
 *
 * Changes over the previous version:
 *   - the context and module created for the call are released again
 *     (they used to leak on every invocation)
 *   - cuDeviceGet / cuCtxCreate results are passed to errcheck()
 *   - the kernel name is built with bounded copies
 *   - byte counts are computed in size_t
 *   - a call without input arrays is rejected instead of creating
 *     zero-length arrays
 *
 * NOTE(review): raiseError() is assumed not to return, as the original code
 * already relied on for the "no devices" case -- confirm.
 */
Object cuda_over_map(Object self, int nparts, int *argcv, Object *argv, int flags) {
    const int nargs = argcv[0];
    int deviceCount = 0;
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    CUdeviceptr d_res;

    if (nargs < 1) {
        raiseError("cuda_over_map requires at least one array argument");
    }

    /* deviceCount stays 0 if the driver cannot be initialised */
    cuInit(0);
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }

    errcheck(cuDeviceGet(&cuDevice, 0));
    errcheck(cuCtxCreate(&cuContext, 0, cuDevice));

    const char *modulePath = grcstring(argv[nargs]);
    errcheck(cuModuleLoad(&cuModule, modulePath));

    CUdeviceptr dps[nargs];
    void *args[nargs+2];
    int size = INT_MAX;

    /* Upload the inputs; `size` shrinks to the shortest array seen so far. */
    for (int i=0; i<nargs; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], (size_t)size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, (size_t)size * sizeof(float)));
        args[i+1] = &dps[i];
    }

    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    const size_t fsize = sizeof(float) * (size_t)size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[nargs+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    /* Derive the kernel name from the module path without overrunning name. */
    char name[256];
    name[0] = '\0';
    strncat(name, "block", sizeof(name) - 1);
    strncat(name, modulePath + strlen("_cuda/"), sizeof(name) - strlen(name) - 1);
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }

    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1, 0, NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));

    /* Release everything created for this call. */
    cuMemFree(d_res);
    for (int i=0; i<nargs; i++)
        cuMemFree(dps[i]);
    cuModuleUnload(cuModule);
    cuCtxDestroy(cuContext);

    return (Object)r;
}
/*
 * Creates a context, loads "./data/<cubin_name>" and looks up `kernel_name`.
 *
 * _cuContext     - context handle; NOTE(review): it is passed by value, so
 *                  the context created here is not visible to the caller.
 *                  The signature is kept for existing callers.
 * executablePath - used to search for the cubin (diagnostic output only)
 * mathop         - receives the kernel function on success
 * argc, argv     - forwarded to CUT_DEVICE_INIT_DRV
 * cubin_name     - file name of the module below ./data/
 * kernel_name    - name of the kernel inside the module
 *
 * Returns CUDA_SUCCESS, or the failing CUresult after reporting it through
 * Error().
 *
 * Fixes over the previous version:
 *   - the path buffer was one byte too small for the terminating NUL
 *   - the allocation was checked for NULL only after it had been written to
 *   - a pointer was printed with %lu
 *   - the malloc'ed path was released with cutFree() and the path returned by
 *     cutFindFilePath() was never released
 *   - execution continued with invalid handles after a failure
 */
static CUresult initCuda(CUcontext _cuContext, char* executablePath, CUfunction *mathop, int argc, char** argv, const char* cubin_name, const char* kernel_name)
{
    CUdevice cuDevice;
    CUT_DEVICE_INIT_DRV(cuDevice, argc, argv);
    print_GetProperties(cuDevice);

    CUresult status = cuCtxCreate( &_cuContext, 0, cuDevice );
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
        return status;
    }
    printf("(1) context creation successful\n");

    /* The located path is only reported; the module is loaded from ./data/. */
    char* module_path = cutFindFilePath(cubin_name, executablePath);
    printf ("\t cubin:%s, path:%s, mmp_ptr:%p\n", cubin_name, executablePath, (void*)module_path);
    if (module_path != NULL) {
        printf ("\t cubin:%s, path:%s, module_path:%.4s\n", cubin_name, executablePath, module_path);
        cutFree(module_path);
    }

    const char* data_path = "./data/";
    size_t len_path = strlen(data_path);
    size_t len_fn = strlen(cubin_name);

    /* +1 for the terminating NUL */
    char* module_path_new = (char*)malloc(len_path + len_fn + 1);
    if (module_path_new == NULL) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        Error(_cuContext, status);
        return status;
    }
    strcpy(module_path_new, data_path);
    strcat(module_path_new, cubin_name);

    FILE *fp = fopen(module_path_new,"r");
    if( fp ) {
        printf("(2) cubin_File found in modulepath:%s\n", module_path_new);
        fclose(fp);
    } else {
        printf("(2) cubin file not exist: %s\n", module_path_new);
    }

    CUmodule cuModule;
    status = cuModuleLoad(&cuModule, module_path_new);
    free(module_path_new);
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
        return status;
    }
    printf ("(3) module Load successful\n");

    CUfunction cuFunction = 0;
    status = cuModuleGetFunction(&cuFunction, cuModule, kernel_name);
    if ( CUDA_SUCCESS != status) {
        Error(_cuContext, status);
        return status;
    }
    printf ("(4) getFunction successful w/cuFunction\n");

    *mathop = cuFunction;
    return CUDA_SUCCESS;
}
int init_accel (void) { #if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_) char * str; int local_rank, dev_count; int dev_id = 0; #endif #ifdef _ENABLE_CUDA_ CUresult curesult = CUDA_SUCCESS; CUdevice cuDevice; #endif switch (options.accel) { #ifdef _ENABLE_CUDA_ case managed: case cuda: if ((str = getenv("LOCAL_RANK")) != NULL) { cudaGetDeviceCount(&dev_count); local_rank = atoi(str); dev_id = local_rank % dev_count; } curesult = cuInit(0); if (curesult != CUDA_SUCCESS) { return 1; } curesult = cuDeviceGet(&cuDevice, dev_id); if (curesult != CUDA_SUCCESS) { return 1; } curesult = cuCtxCreate(&cuContext, 0, cuDevice); if (curesult != CUDA_SUCCESS) { return 1; } break; #endif #ifdef _ENABLE_OPENACC_ case openacc: if ((str = getenv("LOCAL_RANK")) != NULL) { dev_count = acc_get_num_devices(acc_device_not_host); local_rank = atoi(str); dev_id = local_rank % dev_count; } acc_set_device_num (dev_id, acc_device_not_host); break; #endif default: fprintf(stderr, "Invalid device type, should be cuda or openacc\n"); return 1; } return 0; }
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount, long flags) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n"); #endif SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber])); if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) { SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cudaDevice)); } else { SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice)); } if (kernelMap == NULL) { // kernels have not yet been initialized; do so now. Hopefully, this only occurs once per library load. InitializeKernelMap(); } int id = paddedStateCount; if (flags & BEAGLE_FLAG_PRECISION_DOUBLE) { id *= -1; } if (kernelMap->count(id) == 0) { fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount); exit(-1); } kernelResource = (*kernelMap)[id].copy(); kernelResource->categoryCount = categoryCount; kernelResource->patternCount = paddedPatternCount; kernelResource->flags = flags; SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode)); SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::SetDevice\n"); #endif }
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 *
 * For every device a fresh context is created, the module in `imageBytes` is
 * loaded, the function named "kernel" is launched ITERATIONS times with a
 * 1x1x1 grid of 1x1x1 blocks between two events, and the elapsed time divided
 * by ITERATIONS is printed to stdout.
 *
 * Every driver call is wrapped in CU_ERROR_CHECK.
 * NOTE(review): that macro is defined elsewhere; it presumably reports the
 * error and leaves main() early -- confirm.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  // Host value copied into the kernel's input buffer on every device.
  float x = 5.0f;

  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    // One float of input and one float of output.
    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    // Time the whole batch of launches with a pair of events.
    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));
    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    // Average time per launch.
    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
// Creates a CUDA context on `device` with the given creation flags and hands
// it back through *ppContext.
//
// device    - device wrapper; its Handle() is passed to cuCtxCreate
// flags     - flags forwarded to cuCtxCreate
// ppContext - swapped with the newly created context on success
//
// Returns CUDA_SUCCESS on success.
// NOTE(review): HANDLE_RESULT() is defined elsewhere; it presumably returns
// `result` when it is not CUDA_SUCCESS -- confirm.
CUresult CreateCuContext(CuDevice* device, uint flags, ContextPtr* ppContext) {
	ContextPtr context(new CuContext(true));
	CUresult result = cuCtxCreate(&context->_h, flags, device->Handle());
	HANDLE_RESULT();

	// Only flag the wrapper for destruction once a context actually exists.
	context->_destroyOnDtor = true;
	context->_device = device;
	ppContext->swap(context);
	return CUDA_SUCCESS;
}
/* * Initializaiton in order to use kernel program */ void init_cuda(void){ thread_num = (N <= 16) ? N : 16 ; block_num = N / (thread_num*thread_num); if(N % (thread_num*thread_num) != 0) block_num++; res = cuInit(0); if(res != CUDA_SUCCESS){ printf("cuInit failed: res = %s\n", conv(res)); exit(1); } res = cuDeviceGet(&dev, 0); if(res != CUDA_SUCCESS){ printf("cuDeviceGet failed: res = %s\n", conv(res)); exit(1); } res = cuCtxCreate(&ctx, 0, dev); if(res != CUDA_SUCCESS){ printf("cuCtxCreate failed: res = %s\n", conv(res)); exit(1); } res = cuModuleLoad(&module, "./cuda_main.cubin"); if(res != CUDA_SUCCESS){ printf("cuModuleLoad() failed: res = %s\n", conv(res)); exit(1); } res = cuModuleGetFunction(&function, module, "cuda_main"); if(res != CUDA_SUCCESS){ printf("cuModuleGetFunction() failed: res = %s\n", conv(res)); exit(1); } /* * preparation for launch kernel */ res = cuFuncSetSharedSize(function, 0x40); /* just random */ if(res != CUDA_SUCCESS){ printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res)); exit(1); } res = cuFuncSetBlockShape(function, thread_num, thread_num, 1); if(res != CUDA_SUCCESS){ printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res)); exit(1); } }
int main(int argc, char *argv[]) { char c; CUcontext ctx; CUdevice dev = 0; void *toSpace; int status, free, total; CUdeviceptr ptr = (CUdeviceptr)NULL; int size; if(argc != 2){ fprintf(stderr,"Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n"); exit(1); } printf("All status results should be 0, if not an error has occured.\nIf 2 is reported an out of memory error has occured for\nwhich you should decrease the memory input\n"); size = atoi(argv[1]); printf("\nTrying to allocate %iMB of memory on host and GPU\n",size); if(size <= 0){ fprintf(stderr,"\nERROR: Memory must be greater than 0\n"); exit(1); } status = cuInit(0); printf("Init status: %i\n",status); status = cuCtxCreate(&ctx, 0, dev); printf("Context creation status: %i\n",status); cuMemGetInfo(&free, &total); printf("Get memory info status: %i\n",status); printf("\n%.1f/%.1f (Free/Total) MB\n", free/1024.0/1024.0, total/1024.0/1024.0); status = cuMemHostAlloc(&toSpace, size*1024*1024, 0); printf("Host allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); status = cuMemAlloc(&ptr, size*1024*1024); printf("GPU allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); printf("\nPress any key to exit..."); scanf("%c", &c); status = cuCtxDestroy(ctx); printf("Context destroy status: %i\n",status); return 0; }
/*
 * R binding for cuCtxCreate().
 *
 * r_flags - numeric vector; its first element is used as the creation flags
 * r_dev   - integer vector; its first element is used as the device
 *
 * Returns a "CUcontext" reference object on success, otherwise the value
 * produced by R_cudaErrorInfo() for the failing CUresult.
 */
SEXP
R_cuCtxCreate(SEXP r_flags, SEXP r_dev)
{
    CUcontext pctx;
    unsigned int flags = REAL(r_flags)[0];
    int dev = INTEGER(r_dev)[0];

    CUresult ans = cuCtxCreate(& pctx, flags, dev);
    if (ans != CUDA_SUCCESS)
        return R_cudaErrorInfo(ans);

    return R_createRef(pctx, "CUcontext");
}
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }
/*
 * Creates a context on `dev` (stored in *pctx) and, when `s` is non-zero,
 * allocates `s` bytes of device memory in it. The allocation is not handed
 * back to the caller.
 *
 * Returns 0 on success, -1 after printing the CUresult code to stderr.
 */
static int
init_thread(CUcontext *pctx,CUdevice dev,size_t s){
	CUresult cerr = cuCtxCreate(pctx,0,dev);

	if(cerr != CUDA_SUCCESS){
		fprintf(stderr," Error (%d) creating CUDA context\n",cerr);
		return -1;
	}

	/* nothing to allocate */
	if(s == 0){
		return 0;
	}

	CUdeviceptr ptr;
	cerr = cuMemAlloc(&ptr,s);
	if(cerr != CUDA_SUCCESS){
		fprintf(stderr," Error (%d) allocating %zub\n",cerr,s);
		return -1;
	}
	return 0;
}
static void *cuda_init(int ord, int flags, int *ret) { CUdevice dev; CUcontext ctx; cuda_context *res; static int init_done = 0; unsigned int fl = CU_CTX_SCHED_AUTO; if (ord == -1) { /* Grab the ambient context */ err = cuCtxGetCurrent(&ctx); CHKFAIL(NULL); res = cuda_make_ctx(ctx, DONTFREE); if (res == NULL) { FAIL(NULL, GA_IMPL_ERROR); } res->flags |= flags; return res; } if (!init_done) { err = cuInit(0); CHKFAIL(NULL); init_done = 1; } err = cuDeviceGet(&dev, ord); CHKFAIL(NULL); if (flags & GA_CTX_SINGLE_THREAD) fl = CU_CTX_SCHED_SPIN; if (flags & GA_CTX_MULTI_THREAD) fl = CU_CTX_SCHED_YIELD; err = cuCtxCreate(&ctx, fl, dev); CHKFAIL(NULL); res = cuda_make_ctx(ctx, 0); res->flags |= flags; if (res == NULL) { cuCtxDestroy(ctx); FAIL(NULL, GA_IMPL_ERROR); } /* Don't leave the context on the thread stack */ cuCtxPopCurrent(NULL); return res; }
// Sets up the CUDA backend of this device from the argument map.
//
// aim - argument map; must contain "deviceID", the ordinal handed to
//       cuDeviceGet
//
// Throws the int 1 (after printing a message to std::cout) when "deviceID"
// is missing. Driver calls are wrapped in OCCA_CUDA_CHECK together with a
// description string.
// NOTE(review): OCCA_EXTRACT_DATA is defined elsewhere; it presumably
// introduces the `data_` reference used below -- confirm.
void device_t<CUDA>::setup(argInfoMap &aim){
  cuda::init();

  // Backend-specific state holding the device and context handles.
  data = new CUDADeviceData_t;

  OCCA_EXTRACT_DATA(CUDA, Device);

  if(!aim.has("deviceID")){
    std::cout << "[CUDA] device not given [deviceID]\n";
    throw 1;
  }

  const int deviceID = aim.iGet("deviceID");

  OCCA_CUDA_CHECK("Device: Creating Device",
                  cuDeviceGet(&data_.device, deviceID));

  OCCA_CUDA_CHECK("Device: Creating Context",
                  cuCtxCreate(&data_.context, CU_CTX_SCHED_AUTO, data_.device));
}
/*
 * NIF: creates a CUDA context and returns {ok, ContextResource}.
 *
 * argv[0] (optional) - device ordinal, defaults to 0
 *
 * Returns badarg when the argument is not an integer, ATOM_ERROR when the
 * device or context cannot be created, and OOM_ERROR when the resource
 * cannot be allocated.
 *
 * The resource is now allocated only after the context exists, so the
 * badarg and CUDA-failure paths no longer leak an unreleased resource. If
 * the resource allocation fails, the context is destroyed again.
 */
ERL_NIF_TERM pteracuda_nifs_new_context(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    CUdevice device;
    CUcontext ctx;
    int deviceNum = 0;

    if (argc == 1 && !enif_get_int(env, argv[0], &deviceNum)) {
        return enif_make_badarg(env);
    }

    if (cuDeviceGet(&device, deviceNum) != CUDA_SUCCESS ||
        cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, device) != CUDA_SUCCESS) {
        return ATOM_ERROR;
    }

    PCudaContextRef *ref = (PCudaContextRef *) enif_alloc_resource(pteracuda_context_resource, sizeof(PCudaContextRef));
    if (!ref) {
        cuCtxDestroy(ctx);
        return OOM_ERROR;
    }

    ref->ctx = ctx;
    ref->destroyed = false;

    /* The term keeps the resource alive; drop our own reference. */
    ERL_NIF_TERM result = enif_make_resource(env, ref);
    enif_release_resource(ref);
    return enif_make_tuple2(env, ATOM_OK, result);
}
// Initializes the driver API, creates a context on `device`, loads
// "PTXTestFunctions.o.ptx" and creates a stream.
//
// device    - device ordinal
// phContext - receives the context
// phDevice  - receives the device handle
// phModule  - receives the loaded module
// phStream  - receives the stream
//
// Returns CUDA_SUCCESS. Any failure prints a message and terminates the
// process with exit status 1 (it used to exit with 0, which reported success
// to the shell). cuInit and cuDeviceGet are now checked as well.
CUresult initialize(int device, CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUstream *phStream)
{
  // Initialize the device and create the context
  CUresult status = cuInit(0);
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not initialize CUDA: " << status << "\n"; exit(1);}

  status = cuDeviceGet(phDevice, device);
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not get device " << device << ": " << status << "\n"; exit(1);}

  status = cuCtxCreate(phContext, 0, *phDevice);
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not create context\n"; exit(1);}

  status = cuModuleLoad(phModule, "PTXTestFunctions.o.ptx");
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not load .ptx module: " << status << "\n"; exit(1);}

  // Create stream
  status = cuStreamCreate(phStream, 0);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during stream creation\n"); exit(1);}

  return status;
}
int main(int argc, char ** argv) { int dev_count = 0; CUdevice device; CUcontext context; CUmodule module; CUfunction function; cuInit(0); cuDeviceGetCount(&dev_count); if (dev_count < 1) return -1; cuDeviceGet( &device, 0 ); cuCtxCreate( &context, 0, device ); cuModuleLoad( &module, "hello.cuda_runtime.ptx" ); cuModuleGetFunction( &function, module, "_Z6kernelPf" ); int N = 512; CUdeviceptr pData; cuMemAlloc( &pData, N * sizeof(float) ); cuFuncSetBlockShape( function, N, 1, 1 ); cuParamSeti( function, 0, pData ); cuParamSetSize( function, 4 ); cuLaunchGrid( function, 1, 1 ); float * pHostData = new float[N]; cuMemcpyDtoH( pHostData, pData, N * sizeof( float) ); cuMemFree( pData ); delete [] pHostData; return 0; }
__attribute__((constructor)) void initTrace() { //get the arguments from the environment variables int deviceId, sampRate; CUcontext cuCtx; deviceId = atoi(getenv("GPU_DEVICE_ID")); cuInit(0); cuCtxCreate(&cuCtx,0,deviceId); CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); //CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL)); CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE)); CUpti_ActivityPCSamplingConfig config; sampRate=atoi(getenv("PC_SAMPLING_RATE")); config.samplingPeriod= sampRate; CUPTI_CALL(cuptiActivityConfigurePCSampling(cuCtx, &config)); }
int madd_gpu_init(struct device_info *device_info) { char fname[256]; CUresult res; /* printf("madd_gpu_init called.\n"); */ /* Initialization */ if ((res = cuInit(0)) != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } /* binary files are located in the same directory as the source code */ if ((res = cuModuleLoad(&device_info->module, MODULE_FILE_NAME)) != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } if ((res = cuModuleGetFunction(&device_info->kernel, device_info->module, KERNEL_NAME)) != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } return 0; }
bool VideoDecoderCUDAPrivate::initCuda() { CUresult result = cuInit(0); if (result != CUDA_SUCCESS) { available = false; qWarning("cuInit(0) faile (%d)", result); return false; } cudev = GetMaxGflopsGraphicsDeviceId(); int clockRate; cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev); int major, minor; cuDeviceComputeCapability(&major, &minor, cudev); char devname[256]; cuDeviceGetName(devname, 256, cudev); description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000); //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO? CUcontext cuCurrent = NULL; result = cuCtxPopCurrent(&cuCurrent); if (result != CUDA_SUCCESS) { qWarning("cuCtxPopCurrent: %d\n", result); return false; } checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx)); { AutoCtxLock lock(this, vid_ctx_lock); Q_UNUSED(lock); //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING) checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT //require compute capability >= 1.1 //flag: Reserved for future use, must be 0 //cuStreamAddCallback(stream, CUstreamCallback, this, 0); } return true; }