bool load_kernels(bool experimental) { /* check if cuda init succeeded */ if(cuContext == 0) return false; /* check if GPU is supported with current feature set */ if(!support_device(experimental)) return false; /* get kernel */ string cubin = compile_kernel(); if(cubin == "") return false; /* open module */ cuda_push_context(); CUresult result = cuModuleLoad(&cuModule, cubin.c_str()); if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); cuda_pop_context(); return (result == CUDA_SUCCESS); }
/*
 * Initialise the CUDA driver API, create a context on device 0 and load
 * the module file `f` into it.
 *
 *   pctx - receives the created context
 *   pmod - receives the loaded module
 *   f    - path passed to cuModuleLoad
 *
 * Returns CUDA_SUCCESS, or the CUresult of the first call that failed
 * (a diagnostic is printed to stdout). If the module cannot be loaded the
 * context created here is destroyed again before returning.
 */
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
    CUdevice device;
    CUresult status;

    if ((status = cuInit(0)) != CUDA_SUCCESS) {
        printf("cuInit failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    if ((status = cuDeviceGet(&device, 0)) != CUDA_SUCCESS) {
        printf("cuDeviceGet failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    if ((status = cuCtxCreate(pctx, 0, device)) != CUDA_SUCCESS) {
        printf("cuCtxCreate failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    status = cuModuleLoad(pmod, f);
    if (status == CUDA_SUCCESS)
        return CUDA_SUCCESS;

    /* module load failed: undo the context creation */
    printf("cuModuleLoad() failed\n");
    cuCtxDestroy(*pctx);
    return status;
}
/*
 * Run the CUDA kernel named by argv[argcv[0]] over the first argcv[0]
 * arguments, each treated as a struct CudaFloatArray.
 *
 * The result array and every copied input use `size`, the running minimum
 * of the input sizes. Kernel parameters are: result buffer, the input
 * buffers in order, then `size`. The kernel name is "block" followed by
 * the module path with its "_cuda/" prefix and everything from the first
 * '.' removed.
 *
 * Returns a freshly allocated CudaFloatArray holding the result.
 */
Object cuda_over_map(Object self, int nparts, int *argcv, Object *argv,
        int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    /* A failing cuInit/cuDeviceGetCount leaves deviceCount at 0, so both
     * failures are reported through the check below. */
    error = cuDeviceGetCount(&deviceCount);
    (void)error;
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    /* These results used to be stored and ignored. */
    errcheck(cuDeviceGet(&cuDevice, 0));
    errcheck(cuCtxCreate(&cuContext, 0, cuDevice));
    CUdeviceptr d_res;
    char *modulePath = grcstring(argv[argcv[0]]);
    errcheck(cuModuleLoad(&cuModule, modulePath));

    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }

    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    /* Build the kernel name without overrunning name[], and without
     * reading past the end of a path shorter than the "_cuda/" prefix. */
    char name[256];
    strcpy(name, "block");
    if (strlen(modulePath) > strlen("_cuda/"))
        strncat(name, modulePath + strlen("_cuda/"),
                sizeof(name) - strlen(name) - 1);
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }

    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
                threadsPerBlock, 1, 1, 0, NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));

    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    /* Each call creates its own context; release it (and the module loaded
     * into it) so repeated calls do not pile up contexts. */
    cuCtxDestroy(cuContext);
    return (Object)r;
}
// Load "videoPP64.ptx" and resolve the three post-processing kernels into
// the g_kernel* globals. Each driver call is passed through
// checkCudaErrors. Returns CUDA_SUCCESS when every call has been made;
// the original fell off the end of a non-void function, which is
// undefined behaviour.
CUresult loadCUDAModules()
{
    CUmodule cuModule_;

    checkCudaErrors(cuModuleLoad(&cuModule_, "videoPP64.ptx"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelNV12toARGB, cuModule_, "NV12ToARGBdrvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBtoNV12, cuModule_, "ARGBToNv12drvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBpostprocess, cuModule_, "ARGBpostprocess"));

    return CUDA_SUCCESS;
}
/*
 * Initialise a device, create a context, load the cubin "./data/<cubin_name>"
 * and look up `kernel_name` in it.
 *
 *   _cuContext     - passed BY VALUE: the context created here is not
 *                    visible to the caller (interface kept as-is)
 *   executablePath - forwarded to cutFindFilePath (its result is only
 *                    printed for diagnostics)
 *   mathop         - receives the resolved kernel function
 *   argc, argv     - forwarded to CUT_DEVICE_INIT_DRV
 *   cubin_name     - file name appended to "./data/"
 *   kernel_name    - function name looked up in the module
 *
 * Failures are reported through Error(); progress is printed to stdout.
 * Returns CUDA_SUCCESS at the end, or CUDA_ERROR_NOT_FOUND if the path
 * buffer cannot be allocated.
 */
static CUresult initCuda(CUcontext _cuContext, char* executablePath, CUfunction *mathop,
                         int argc, char** argv, const char* cubin_name, const char* kernel_name)
{
    CUdevice cuDevice;
    CUT_DEVICE_INIT_DRV(cuDevice, argc, argv);
    print_GetProperties(cuDevice);

    CUresult status = cuCtxCreate( &_cuContext, 0, cuDevice );
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf("(1) context creation successful\n");

    char* module_path = cutFindFilePath(cubin_name, executablePath);
    /* %p is the correct conversion for a pointer (was %lu) */
    printf ("\t cubin:%s, path:%s, mmp_ptr:%p\n", cubin_name, executablePath, (void*)module_path);
    if(module_path != NULL) {
        /* %.4s prints at most four characters and stops at the terminator,
         * so paths shorter than four characters are not over-read */
        printf ("\t cubin:%s, path:%s, module_path:%.4s\n", cubin_name, executablePath, module_path);
        /* the looked-up path is not used any further; release it */
        cutFree(module_path);
    }

    const char* data_path = "./data/";
    size_t len_path = strlen(data_path);
    size_t len_fn = strlen(cubin_name);
    // printf ("Sizes: data:%lu, cubinname:%lu\n", len_path, len_fn);

    /* +1 for the terminating NUL (the original allocation was one short) */
    char* module_path_new = (char*)malloc(len_path + len_fn + 1);
    /* check the allocation BEFORE writing through the pointer */
    if (module_path_new == NULL) {
        status = CUDA_ERROR_NOT_FOUND;
        Error(_cuContext, status);
        return status;
    }
    strcpy(module_path_new, data_path);
    strcat(module_path_new, cubin_name);

    FILE *fp = fopen(module_path_new,"r");
    if( fp ) {
        printf("(2) cubin_File found in modulepath:%s\n", module_path_new);
        fclose(fp);
    } else {
        printf("(2) cubin file not exist: %s\n", module_path_new);
    }

    CUmodule cuModule;
    status = cuModuleLoad(&cuModule, module_path_new);
    /* allocated with malloc above, so release with free (was cutFree) */
    free(module_path_new);
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf ("(3) module Load successful\n");

    CUfunction cuFunction = 0;
    status = cuModuleGetFunction(&cuFunction, cuModule, kernel_name);
    if ( CUDA_SUCCESS != status) {
        Error(_cuContext, status);
    }
    else printf ("(4) getFunction successful w/cuFunction\n");

    *mathop = cuFunction;

    return CUDA_SUCCESS;
}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: (JLjava/lang/String;I)V
 *
 * Loads the module file `filename`, resolves the kernel entry point into
 * the global cuFunction, selects the prefer-L1 cache configuration and
 * sets up the kernel parameter block: seven CUdeviceptr-sized values taken
 * from the file-level gpu* variables, followed by the int num_blocks.
 * heap_end_ptr is stored in the global heapEndPtr.
 *
 * Every driver status is passed to CHECK_STATUS (defined elsewhere).
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jstring filename, jint num_blocks){

  int offset;
  CUresult status;
  const char * native_filename;

  heapEndPtr = heap_end_ptr;

  native_filename = (*env)->GetStringUTFChars(env, filename, 0);
  if (native_filename == NULL) {
    /* GetStringUTFChars failed; do not hand NULL to cuModuleLoad */
    return;
  }
  status = cuModuleLoad(&cuModule, native_filename);
  /* Release the UTF chars before checking the status so the string is not
   * leaked should CHECK_STATUS leave this function on error. */
  (*env)->ReleaseStringUTFChars(env, filename, native_filename);
  CHECK_STATUS(env, "error in cuModuleLoad", status);

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_S0_i");
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* seven pointer-sized parameters plus one int */
  status = cuParamSetSize(cuFunction, (7 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuClassMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuClassMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(cuFunction, offset, num_blocks);
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
  offset += sizeof(int);
}
// Load the module file `filename` with cuModuleLoad into a newly allocated
// CuModule, record this context in its _context field, and hand it to the
// caller by swapping it into *ppModule.
//
// filename - path passed to cuModuleLoad
// ppModule - receives the loaded module on the success path
//
// Returns CUDA_SUCCESS on the success path. HANDLE_RESULT() is defined
// elsewhere; NOTE(review): it presumably returns `result` early when the
// load failed, leaving *ppModule untouched - confirm against the macro.
CUresult CuContext::LoadModuleFilename(const std::string& filename, ModulePtr* ppModule) { ModulePtr module(new CuModule); CUresult result = cuModuleLoad(&module->_module, filename.c_str()); HANDLE_RESULT(); module->_context = this; ppModule->swap(module); return CUDA_SUCCESS; }
/* * Initializaiton in order to use kernel program */ void init_cuda(void){ thread_num = (N <= 16) ? N : 16 ; block_num = N / (thread_num*thread_num); if(N % (thread_num*thread_num) != 0) block_num++; res = cuInit(0); if(res != CUDA_SUCCESS){ printf("cuInit failed: res = %s\n", conv(res)); exit(1); } res = cuDeviceGet(&dev, 0); if(res != CUDA_SUCCESS){ printf("cuDeviceGet failed: res = %s\n", conv(res)); exit(1); } res = cuCtxCreate(&ctx, 0, dev); if(res != CUDA_SUCCESS){ printf("cuCtxCreate failed: res = %s\n", conv(res)); exit(1); } res = cuModuleLoad(&module, "./cuda_main.cubin"); if(res != CUDA_SUCCESS){ printf("cuModuleLoad() failed: res = %s\n", conv(res)); exit(1); } res = cuModuleGetFunction(&function, module, "cuda_main"); if(res != CUDA_SUCCESS){ printf("cuModuleGetFunction() failed: res = %s\n", conv(res)); exit(1); } /* * preparation for launch kernel */ res = cuFuncSetSharedSize(function, 0x40); /* just random */ if(res != CUDA_SUCCESS){ printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res)); exit(1); } res = cuFuncSetBlockShape(function, thread_num, thread_num, 1); if(res != CUDA_SUCCESS){ printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res)); exit(1); } }
/*
 * R entry point wrapping cuModuleLoad.
 *
 *   r_fname - R character vector; its first element is the module path.
 *
 * Returns R_cudaErrorInfo(status) when the driver reports a non-zero
 * status, otherwise a "CUmodule" reference created with R_createRef.
 */
SEXP R_auto_cuModuleLoad(SEXP r_fname)
{
    const char *path = CHAR(STRING_ELT(r_fname, 0));
    CUmodule handle;
    CUresult status = cuModuleLoad(&handle, path);

    /* CUDA_SUCCESS is zero, so any other value is an error */
    if (status != CUDA_SUCCESS)
        return R_cudaErrorInfo(status);

    return R_createRef(handle, "CUmodule");
}
// Build this kernel from an already compiled module file.
//
// filename      - path handed to cuModuleLoad
// functionName_ - name stored in functionName and looked up in the module
//
// The module and function handles are written to data_.module and
// data_.function. Both driver calls are wrapped in OCCA_CUDA_CHECK with a
// message naming the kernel. Returns this.
// NOTE(review): data_ is not declared here; presumably OCCA_EXTRACT_DATA
// introduces it - confirm against the macro definition.
kernel_t<CUDA>* kernel_t<CUDA>::buildFromBinary(const std::string &filename, const std::string &functionName_){ OCCA_EXTRACT_DATA(CUDA, Kernel); functionName = functionName_; OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module", cuModuleLoad(&data_.module, filename.c_str())); OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function", cuModuleGetFunction(&data_.function, data_.module, functionName.c_str())); return this; }
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }
// Initialize the driver, fetch device number `device`, create a context on
// it, load "PTXTestFunctions.o.ptx" and create a stream.
//
// phContext / phDevice / phModule / phStream receive the created handles.
//
// Any failure prints a message and terminates the process. cuInit and
// cuDeviceGet are now checked as well (their results used to be dropped),
// and failures exit with a non-zero status instead of exit(0).
// Returns the status of the last call, i.e. CUDA_SUCCESS.
CUresult initialize(int device, CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUstream *phStream)
{
    // Initialize the device and create the context
    CUresult status = cuInit(0);
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not initialize the CUDA driver: " << status << "\n";
        exit(1);
    }

    status = cuDeviceGet(phDevice, device);
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not get device " << device << ": " << status << "\n";
        exit(1);
    }

    status = cuCtxCreate(phContext, 0, *phDevice);
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not create context\n";
        exit(1);
    }

    status = cuModuleLoad(phModule, "PTXTestFunctions.o.ptx");
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not load .ptx module: " << status << "\n";
        exit(1);
    }

    // Create stream
    status = cuStreamCreate(phStream, 0);
    if (status != CUDA_SUCCESS) {
        printf("ERROR: during stream creation\n");
        exit(1);
    }

    return status;
}
int main(int argc, char ** argv) { int dev_count = 0; CUdevice device; CUcontext context; CUmodule module; CUfunction function; cuInit(0); cuDeviceGetCount(&dev_count); if (dev_count < 1) return -1; cuDeviceGet( &device, 0 ); cuCtxCreate( &context, 0, device ); cuModuleLoad( &module, "hello.cuda_runtime.ptx" ); cuModuleGetFunction( &function, module, "_Z6kernelPf" ); int N = 512; CUdeviceptr pData; cuMemAlloc( &pData, N * sizeof(float) ); cuFuncSetBlockShape( function, N, 1, 1 ); cuParamSeti( function, 0, pData ); cuParamSetSize( function, 4 ); cuLaunchGrid( function, 1, 1 ); float * pHostData = new float[N]; cuMemcpyDtoH( pHostData, pData, N * sizeof( float) ); cuMemFree( pData ); delete [] pHostData; return 0; }
int madd_gpu_init(struct device_info *device_info) { char fname[256]; CUresult res; /* printf("madd_gpu_init called.\n"); */ /* Initialization */ if ((res = cuInit(0)) != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } /* binary files are located in the same directory as the source code */ if ((res = cuModuleLoad(&device_info->module, MODULE_FILE_NAME)) != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } if ((res = cuModuleGetFunction(&device_info->kernel, device_info->module, KERNEL_NAME)) != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } return 0; }
// Build this kernel from source, going through the on-disk binary cache.
//
// filename      - kernel source file
// functionName_ - stored in functionName and looked up in the module
// info_         - kernel info; a local copy is used for cache naming,
//                 intermediate-source generation and compiler flags
//
// Flow: if the cached binary exists, or haveFile() reports false (in which
// case waitForFile() is called first), the work is delegated to
// buildFromBinary. Otherwise two compiler commands are run through
// system(): a "-Xptxas -v" object compile whose exit status is ignored,
// then the "-ptx" compile into the cache file. A failing second command
// releases the cache file and throws 1. Finally the module and function
// are loaded, releasing the cache file on every path. Returns this.
kernel_t<CUDA>* kernel_t<CUDA>::buildFromSource(const std::string &filename,
                                                const std::string &functionName_,
                                                const kernelInfo &info_){
  OCCA_EXTRACT_DATA(CUDA, Kernel);

  functionName = functionName_;

  kernelInfo info = info_;

  std::string cachedBinary = getCachedBinaryName(filename, info);

  // A binary already on disk short-circuits compilation
  struct stat cachedStat;

  if(stat(cachedBinary.c_str(), &cachedStat) == 0){
    std::cout << "Found cached binary of [" << filename << "] in [" << cachedBinary << "]\n";
    return buildFromBinary(cachedBinary, functionName);
  }

  if(!haveFile(cachedBinary)){
    waitForFile(cachedBinary);

    return buildFromBinary(cachedBinary, functionName);
  }

  std::string intermediateSource = createIntermediateSource(filename,
                                                            cachedBinary,
                                                            info);

  std::string libPath, soname;
  getFilePrefixAndName(cachedBinary, libPath, soname);

  std::string objectFile = libPath + "o_" + soname + ".o";

  // Only derive -arch from the device when the flags do not set one
  std::string archSM = "";

  if(dev->dHandle->compilerFlags.find("-arch=sm_") == std::string::npos){
    int major, minor;

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Getting CUDA Device Arch",
                    cuDeviceComputeCapability(&major, &minor, data_.device) );

    std::stringstream archStream;
    archStream << " -arch=sm_" << major << minor << ' ';

    archSM = archStream.str();
  }

  //---[ PTX Check Command ]----------
  std::stringstream ptxCheck;

  if(dev->dHandle->compilerEnvScript.size())
    ptxCheck << dev->dHandle->compilerEnvScript << " && ";

  ptxCheck << dev->dHandle->compiler
           << ' ' << dev->dHandle->compilerFlags
           << archSM
           << " -Xptxas -v,-dlcm=cg,-abi=no"
           << ' ' << info.flags
           << " -x cu -c " << intermediateSource
           << " -o " << objectFile;

  const std::string ptxCommand = ptxCheck.str();

  std::cout << "Compiling [" << functionName << "]\n" << ptxCommand << "\n";

  // The exit status of this first pass is deliberately not acted upon
#if (OCCA_OS == LINUX_OS) || (OCCA_OS == OSX_OS)
  const int ptxCheckStatus = system(ptxCommand.c_str());
#else
  const int ptxCheckStatus = system(("\"" + ptxCommand + "\"").c_str());
#endif
  (void) ptxCheckStatus;

  //---[ Compiling Command ]----------
  std::stringstream compile;

  compile << dev->dHandle->compiler
          << " -o " << cachedBinary
          << " -ptx -I."
          << ' ' << dev->dHandle->compilerFlags
          << archSM
          << ' ' << info.flags
          << " -x cu " << intermediateSource;

  const std::string sCommand = compile.str();

  std::cout << sCommand << '\n';

  if(system(sCommand.c_str())){
    releaseFile(cachedBinary);
    throw 1;
  }

  // Release the cache file before each check reports a failure
  const CUresult loadStatus = cuModuleLoad(&data_.module, cachedBinary.c_str());

  if(loadStatus)
    releaseFile(cachedBinary);

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                  loadStatus);

  const CUresult lookupStatus = cuModuleGetFunction(&data_.function, data_.module, functionName.c_str());

  if(lookupStatus)
    releaseFile(cachedBinary);

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                  lookupStatus);

  releaseFile(cachedBinary);

  return this;
}
/*
 * Launch a CUDA kernel with explicit block and grid dimensions.
 *
 * argv[0 .. argcv[0]-1] are the kernel arguments. argv[argcv[0]] is a
 * descriptor string "<module path> <type> <type> ..." with one type token
 * per argument: "f*" (CudaFloatArray, copied to the device and back),
 * "f" (float scalar) or "i" (int scalar). The next four argv entries are
 * blockWidth, blockHeight, gridWidth and gridHeight.
 *
 * The kernel name is "block" followed by the module path with its
 * "_cuda/" prefix and everything from the first '.' removed.
 *
 * Returns alloc_none().
 */
Object cuda_using_do_blockWidth_blockHeight_gridWidth_gridHeight(Object self,
        int nparts, int *argcv, Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    /* A failing cuInit/cuDeviceGetCount leaves deviceCount at 0. */
    error = cuDeviceGetCount(&deviceCount);
    (void)error;
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }

    // do through gridWidth only have one argument each
    int argOffset = argcv[0] + 1;
    int blockDimX = integerfromAny(argv[argOffset++]);
    int blockDimY = integerfromAny(argv[argOffset++]);
    int gridDimX = integerfromAny(argv[argOffset++]);
    int gridDimY = integerfromAny(argv[argOffset++]);

    /* Parse the module path before any GPU resource exists so that a
     * malformed descriptor cannot leak a context. */
    char *tmp = grcstring(argv[argcv[0]]);
    char argStr[strlen(tmp) + 1];
    strcpy(argStr, tmp);
    char *tmp2 = strtok(argStr, " ");
    char blockname[128];
    if (tmp2 == NULL || strlen(tmp2) >= sizeof(blockname)) {
        raiseError("CUDA block descriptor has a missing or over-long module path");
        return alloc_none();
    }
    strcpy(blockname, tmp2);

    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    /* These results used to be stored and ignored. */
    errcheck(cuDeviceGet(&cuDevice, 0));
    errcheck(cuCtxCreate(&cuContext, 0, cuDevice));
    errcheck(cuModuleLoad(&cuModule, blockname));

    CUdeviceptr dps[argcv[0]];
    float floats[argcv[0]];
    void *args[argcv[0]];
    int ints[argcv[0]];
    int isArray[argcv[0]];

    /* Restart tokenisation from a fresh copy and skip the module path.
     * (Patching a ' ' back into the buffer overwrote the terminator when
     * the descriptor had no argument types.) */
    strcpy(argStr, tmp);
    strtok(argStr, " ");
    for (int i=0; i<argcv[0]; i++) {
        char *argType = strtok(NULL, " ");
        isArray[i] = 0;
        if (argType == NULL) {
            /* fewer type tokens than arguments */
            cuCtxDestroy(cuContext);
            raiseError("CUDA argument type list is shorter than the argument list");
            return alloc_none();
        }
        if (argType[0] == 'f' && argType[1] == '*') {
            struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
            errcheck(cuMemAlloc(&dps[i], a->size * sizeof(float)));
            errcheck(cuMemcpyHtoD(dps[i], &a->data, a->size * sizeof(float)));
            args[i] = &dps[i];
            isArray[i] = 1;
        } else if (argType[0] == 'f') {
            floats[i] = (float)*((double *)(argv[i]->data));
            args[i] = &floats[i];
        } else if (argType[0] == 'i') {
            ints[i] = integerfromAny(argv[i]);
            args[i] = &ints[i];
        } else {
            // Fail
            char buf[256];
            snprintf(buf, sizeof(buf), "CUDA argument cannot be coerced. This shouldn't happen. Argument string: %s\n", argType);
            cuCtxDestroy(cuContext);
            raiseError(buf);
            return alloc_none();
        }
    }

    char name[256];
    strcpy(name, "block");
    /* blockname is < 128 bytes so this fits; skip the append when the path
     * is too short to carry the "_cuda/" prefix. */
    if (strlen(blockname) > strlen("_cuda/"))
        strcat(name, blockname + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }

    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, gridDimX, gridDimY, 1,
                blockDimX, blockDimY, 1, 0, NULL, args, NULL));

    /* Only "f*" arguments own a device buffer. Scalars must not be read as
     * CudaFloatArray, nor have an uninitialised dps[i] freed. */
    for (int i=0; i<argcv[0]; i++) {
        if (!isArray[i])
            continue;
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        errcheck(cuMemcpyDtoH(&a->data, dps[i], a->size * sizeof(float)));
        cuMemFree(dps[i]);
    }

    /* Each call creates its own context; release it (and its module). */
    cuCtxDestroy(cuContext);
    return alloc_none();
}
void test_tasks(unsigned int size, int nr_tasks) { int i; pid_t pid; int status; CUresult res; CUdevice dev; CUcontext ctx; CUdeviceptr data_addr; CUmodule module; CUfunction function; unsigned int *in, *out; unsigned int n = size / 4; res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %u\n", res); exit(-1); } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %u\n", res); exit(-1); } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %u\n", res); exit(-1); } res = cuMemAlloc(&data_addr, size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); exit(-1); } in = (unsigned int *) malloc(size); out = (unsigned int *) malloc(size); res = cuMemcpyHtoD(data_addr, in, size); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); exit(-1); } #if 1 res = cuModuleLoad(&module, "./loop_gpu.cubin"); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); exit(-1); } res = cuModuleGetFunction(&function, module, "_Z4loopPjjj"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); exit(-1); } void *param1[] = {&data_addr, &size, &n}; //res = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, (void**)param1, 0); if (res != CUDA_SUCCESS) { printf("cuLaunchKernel failed: res = %u\n", res); exit(-1); } //cuCtxSynchronize(); #endif if (--nr_tasks) { pid = fork(); if (pid == 0) { /* child */ test_tasks(size, nr_tasks); printf("Child finished\n"); exit(0); } else { /* parent */ waitpid(pid, &status, 0); } } #if 0 res = cuModuleLoad(&module, "./loop_gpu.cubin"); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); exit(-1); } res = cuModuleGetFunction(&function, module, "_Z4loopPjjj"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); exit(-1); } void *param1[] = {&data_addr, &size, &n}; res = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, (void**)param1, 0); if (res != CUDA_SUCCESS) { 
printf("cuLaunchKernel failed: res = %u\n", res); exit(-1); } #endif res = cuMemcpyDtoH(out, data_addr, size); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH failed: res = %u\n", res); exit(-1); } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", res); exit(-1); } res = cuMemFree(data_addr); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", res); exit(-1); } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res); exit(-1); } free(in); free(out); }
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); count = (count > 2) ? 2 : count; CUdevice devices[count]; for (int i = 0; i < count; i++) CU_ERROR_CHECK(cuDeviceGet(&devices[i], i)); // Question 1: Can you create multiple contexts on the same device? { fprintf(stderr, "Attempting to create multiple contexts on each device...\n"); CUcontext contexts[count * N]; size_t j = 0; for (int i = 0; i < count; i++) { CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < N && error == CUDA_SUCCESS; k++) { error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]); if (error == CUDA_SUCCESS) CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++])); } fprintf(stderr, " created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error)); } CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < j && error == CUDA_SUCCESS; k++) error = cuCtxPushCurrent(contexts[k]); if (error == CUDA_SUCCESS) fprintf(stderr, " successfully pushed %zu contexts with cuCtxPushCurrent\n", k); else fprintf(stderr, " pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error)); for (size_t k = 0; k < j; k++) CU_ERROR_CHECK(cuCtxDestroy(contexts[k])); fprintf(stderr, "\n"); } CUcontext contexts[count][2]; for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) { CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j])); } } // Question 2: Can you access a host pointer in a different context from // which it was created? // Question 3: Can you free a host pointer in a different context from which // it was created? 
{ void * hPtr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUdeviceptr dPtr[count]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } 
fprintf(stderr, "\n"); } // Question 4: Can you access a device pointer in a different context from // which it was created? // Question 5: Can you free a device pointer in a different context from which // it was created? { CUdeviceptr dPtr[count][2]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0][1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1][0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); } // Question 6: Can you access a module in a different context from which it // was loaded? // Question 7: Can you unload a module in a different context from which it // was loaded? { CUmodule module; CUdeviceptr ptr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuModuleLoad(&module, "kernel-test.ptx")); CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUfunction function = 0; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); CUdeviceptr a, b; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float))); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); void * params[] = { &a, & b }; CUresult error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = 
cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemFree(a)); CU_ERROR_CHECK(cuMemFree(b)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j])); } return 0; }
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; unsigned int *a_buf, *b_buf, *c_buf; unsigned long long int a_phys, b_phys, c_phys; unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int)); int block_x, block_y, grid_x, grid_y; char fname[256]; int ret = 0; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; struct timeval tv_mem_alloc_start; struct timeval tv_data_init_start; float data_init; struct timeval tv_conf_kern_start; struct timeval tv_close_start; float mem_alloc; float exec; float init_gpu; float configure_kernel; float close_gpu; float data_read; unsigned int dummy_b, dummy_c; /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? 
n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/madd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res != CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } gettimeofday(&tv_mem_alloc_start, NULL); /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (a) failed\n"); return -1; } res = cuMemGetPhysAddr(&a_phys, (void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (a) failed\n"); return -1; } /*printf("a[]: Physical Address 0x%llx\n", a_phys);*/ /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (b) failed\n"); return -1; } res = cuMemGetPhysAddr(&b_phys, (void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (b) failed\n"); return -1; } /*printf("b[]: Physical Address 0x%llx\n", b_phys);*/ /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned 
int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (c) failed\n"); return -1; } res = cuMemGetPhysAddr(&c_phys, (void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (c) failed\n"); return -1; } /*printf("c[]: Physical Address 0x%llx\n", c_phys);*/ gettimeofday(&tv_data_init_start, NULL); /* initialize A[] & B[] */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { a_buf[idx++] = i; } } for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { b_buf[idx++] = i; } } gettimeofday(&tv_h2d_start, NULL); gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_conf_kern_start, NULL); /* set kernel parameters */ res = cuParamSeti(function, 0, a_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 4, a_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 8, b_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 12, b_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 16, c_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 20, c_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 24, n); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSetSize(function, 28); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); 
/* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ memcpy(c, c_buf, n*n*sizeof(unsigned int)); gettimeofday(&tv_d2h_end, NULL); /* Read back */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { dummy_c = c[idx++]; } } gettimeofday(&tv_close_start, NULL); res = cuMemUnmap((void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_mem_alloc_start, &tv_total_start, &tv); init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv); mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_start, &tv_data_init_start, &tv); data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 
1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_start, &tv_conf_kern_start, &tv); configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_close_start, &tv_d2h_end, &tv); data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_close_start, &tv); close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("Init: %f\n", init_gpu); printf("MemAlloc: %f\n", mem_alloc); printf("DataInit: %f\n", data_init); printf("HtoD: %f\n", h2d); printf("KernConf: %f\n", configure_kernel); printf("Exec: %f\n", exec); printf("DtoH: %f\n", d2h); printf("DataRead: %f\n", data_read); printf("Close: %f\n", close_gpu); printf("Total: %f\n", total); return ret; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } if (acc_async_test_all () != 0) { fprintf (stderr, "asynchronous operation not running\n"); abort (); } sleep ((int) (dtime / 1000.f) + 1); if (acc_async_test_all () != 1) { fprintf (stderr, "found asynchronous operation still running\n"); abort (); } acc_unmap_data (a); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); exit (0); }
// Runs staticInit(), then loads the module stored in `cubinFile` into m_module.
// The status returned by cuModuleLoad is handed to checkError together with
// the name of the call.
CudaModule::CudaModule(const std::string& cubinFile)
{
    staticInit();

    const CUresult loadStatus = cuModuleLoad(&m_module, cubinFile.c_str());
    checkError("cuModuleLoad", loadStatus);
}
int main( int argc, char **argv ) { CUdevice main_device = 0; CUcontext main_context = nullptr; CUmodule mod_vectorAdd = nullptr; CUfunction fun_vectorAdd = nullptr; std::string path_vectorAdd( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.cu" ); std::string ptx_vectorAdd ( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.ptx" ); CUdeviceptr input_data = 0u; CUdeviceptr output_data = 0u; std::size_t problem_size = 1024; try { //Initialize the driver API check_error( cuInit( 0u ) ); { int device_count = 0u; check_error( cuDeviceGetCount( &device_count ) ); if( ! device_count ) { std::cerr << "No CUDA devices available" << std::endl; throw CUDA_ERROR_NO_DEVICE; } } check_error( cuDeviceGet( &main_device, 0 ) ); check_error( cuCtxCreate( &main_context, 0, main_device ) ); //Try to manually compile the source file { std::stringstream build_command; build_command << "nvcc " "-ptx " "-o " << ptx_vectorAdd << " " << path_vectorAdd; if( int build_status = system( build_command.str( ).c_str( ) ) ) { std::cerr << "Failed to compile source cuda file into a ptx assembly" << std::endl; throw CUDA_ERROR_UNKNOWN; } //Find module entry with assembly std::string str_assembly; { std::ifstream fassembly( ptx_vectorAdd ); if( !fassembly.is_open( ) ) { std::cerr << "'Vector Add' assembly unavailable" << std::endl; throw CUDA_ERROR_FILE_NOT_FOUND; } fassembly.seekg (0, std::ios::end); str_assembly.resize( std::string::size_type( fassembly.tellg() ) ); fassembly.seekg (0, std::ios::beg); fassembly.read( &str_assembly[0], str_assembly.size( ) ); fassembly.close( ); } auto entry_pos = str_assembly.find( ".entry" ); if( entry_pos == std::string::npos ) { std::cerr << "No entry point in 'Vector Add'" << std::endl; throw CUDA_ERROR_INVALID_SOURCE; } entry_pos += 6u; //".entry".size( ) auto search_limit = str_assembly.find_first_of( " (", entry_pos ); if( search_limit == std::string::npos ) { std::cerr << "No entry point in 'Vector Add'" << std::endl; throw CUDA_ERROR_INVALID_SOURCE; } std::string 
funcName( str_assembly.substr( entry_pos, search_limit ) ); check_error( cuModuleLoad ( &mod_vectorAdd, ptx_vectorAdd.c_str( ) ) ); check_error( cuModuleGetFunction ( &fun_vectorAdd, mod_vectorAdd, funcName.c_str( ) ) ); } //Play with buffer cuMemAlloc( &input_data, problem_size * sizeof( float ) ); cuMemAlloc( &output_data, problem_size * sizeof( float ) ); { int threadsPerBlock = 256; int blocksPerGrid = (problem_size + threadsPerBlock - 1) / threadsPerBlock; void* args[] = { &input_data, &output_data, &problem_size }; cuLaunchKernel( fun_vectorAdd, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, args, nullptr); } float* result = new float[problem_size]; cuMemcpyDtoH( result, output_data, problem_size * sizeof( float ) ); std::copy( result, result + problem_size, std::ostream_iterator<float>(std::cout, ", ") ); delete[] result; if( output_data ) cuMemFree ( output_data ); if( input_data ) cuMemFree ( input_data ); if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd ); if( main_context ) cuCtxDestroy ( main_context ); } catch( int return_code ) { if( output_data ) cuMemFree ( output_data ); if( input_data ) cuMemFree ( input_data ); if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd ); if( main_context ) cuCtxDestroy ( main_context ); system("PAUSE"); return return_code;
int cuda_test_fmadd(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; float *a = (float *) malloc (n*n * sizeof(float)); float *b = (float *) malloc (n*n * sizeof(float)); float *c = (float *) malloc (n*n * sizeof(float)); int block_x, block_y, grid_x, grid_y; int offset; char fname[256]; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; float exec; /* initialize A[] & B[] */ for (i = 0; i < n; i++) { for(j = 0; j < n; j++) { idx = i * n + j; a[idx] = i + 0.1; b[idx] = i + 0.1; } } /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; printf("block = (%d, %d)\n", block_x, block_y); printf("grid = (%d, %d)\n", grid_x, grid_y); gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/fmadd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetSharedSize(function, 0x40); /* just random */ if (res != CUDA_SUCCESS) { printf("cuFuncSetSharedSize() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res 
!= CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } gettimeofday(&tv_h2d_start, NULL); /* upload a[] and b[] */ res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_h2d_end, NULL); /* set kernel parameters */ offset = 0; res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(a_dev); res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(b_dev); res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(c_dev); res = cuParamSetv(function, offset, &n, sizeof(n)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(n); res = cuParamSetSize(function, offset); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); /* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", 
(unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_d2h_end, NULL); res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); /* check the results */ i = j = idx = 0; while (i < n) { while (j < n) { idx = i * n + j; if (c[idx] != a[idx] + b[idx]) { printf("c[%d] = %f\n", idx, c[idx]); printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]); return -1; } j++; } i++; } free(a); free(b); free(c); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); printf("Exec: %f\n", exec); printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec); printf("Total: %f\n", total); return 0; }
/*
 * Bitonic sort driver.
 *
 * Parses the command line, builds the list of integers to sort (read from
 * stdin or generated with randomInts()), initializes the GPU through the
 * CUDA driver API, launches the per-block bitonic sort kernel and then, when
 * more than one block was needed, launches the block-merge kernel once per
 * merge step.
 *
 * Options handled by getopt ("dip:vO"):
 *   -d      set GROSS_DEBUG in verbose
 *   -i      read the integers from stdin (also sets OUTPUT, see below)
 *   -O      set OUTPUT in verbose
 *   -p N    sort 1 << N generated integers
 *   -v      set DEBUG in verbose
 * Without -i or -p the first non-option argument is taken as the length.
 */
int main( int argc, char** argv) {
    uint num_threads;
    uint num_blocks, block_size;
    uint length;
    uint nBytes;
    int *list;
    int status, verbose, c, i, j, logBlocks;
    int read_stdin;
    struct timeval start_time, end_time;
    unsigned long total_time;
    CUdevice hDevice;
    CUcontext hContext;
    CUmodule hModule;
    CUfunction bitonicBlockFn;
    CUfunction mergeBlocksFn;
    CUdeviceptr pDeviceArrayA;
    CUdeviceptr pDeviceArrayB;

    status = SUCCESS;
    verbose = 0;
    read_stdin = FALSE;
    length = 0;

    while ((c = getopt (argc, argv, "dip:vO")) != -1) {
        switch (c) {
        case 'd':
            verbose |= GROSS_DEBUG;
            break;
        /* NOTE(review): there is no break after 'i', so -i falls through and
         * also sets OUTPUT - confirm that this is intended. */
        case 'i':
            read_stdin = TRUE;
        case 'O':
            verbose |= OUTPUT;
            break;
        case 'p':
            length = 1 << atoi(optarg);
            break;
        case 'v':
            verbose |= DEBUG;
            break;
        case '?':
        default:
            print_usage();
            return FAILURE;
        }
    }

    /* NOTE(review): none of the malloc() results below are checked. */
    if ( read_stdin == TRUE ) {
        /* Read sequence of integers from stdin */
        list = (int*) malloc (INIT_INPUT_SIZE * sizeof(int) );
        length = readIntegers(list, INIT_INPUT_SIZE);
    } else if ( length > 0 ) {
        list = (int*) malloc (length * sizeof(int) );
        randomInts(list, length);
    } else if (optind >= argc) {
        /* No size was given */
        print_usage();
        return FAILURE;
    } else {
        /* Generate our own integers */
        length = atoi(argv[optind]);
        list = (int*) malloc (length * sizeof(int) );
        randomInts(list, length);
    }

    /*
     * Phase 1:
     * There will be one thread for each element to be sorted. Each
     * block will perform bitonic sort on MAX_THREADS_PER_BLOCK elements.
     */

    /* Initialize sizes */
    /* NOTE(review): length can still be 0 here (atoi() or readIntegers()
     * returning 0); if uint is an unsigned type, length-1 then wraps around -
     * confirm how an empty input should be handled. */
    num_threads = _min(length, MAX_THREADS_PER_BLOCK );
    num_blocks = (length-1) / MAX_THREADS_PER_BLOCK + 1;
    nBytes = length * sizeof(int);

    if (verbose & DEBUG) printf("Initializing GPU.\n");

    /* Start timing */
    gettimeofday(&start_time, NULL);

    /* Initialize GPU */
    cutilDrvSafeCall( cuInit(0) );
    cutilDrvSafeCall( cuDeviceGet(&hDevice, 0) );
    cutilDrvSafeCall( cuCtxCreate(&hContext, 0, hDevice) );
    cutilDrvSafeCall( cuModuleLoad(&hModule, MODULE_FILE) );
    cutilDrvSafeCall( cuModuleGetFunction(&bitonicBlockFn, hModule, BITONIC_BLOCK_FN) );

    /* Allocate memory on the device */
    cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayA, nBytes) );
    cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayB, nBytes) );
    cutilDrvSafeCall( cuMemcpyHtoD(pDeviceArrayA, list, nBytes) );

    cutilDrvSafeCall( cuFuncSetBlockShape(bitonicBlockFn, num_threads, 1, 1));
    /* NOTE(review): the device pointer is passed as a single 4-byte integer
     * parameter (the parameter size is set to 4) - confirm that device
     * pointers are 32 bits wide on the target. */
    cutilDrvSafeCall( cuParamSeti(bitonicBlockFn, 0, pDeviceArrayA) );
    cutilDrvSafeCall( cuParamSetSize(bitonicBlockFn, 4) );

    /* Execute the kernel on the GPU */
    if ( verbose & DEBUG )
        printf("Launching bitonic sort kernel with %d blocks and %d threads per block.\n", num_blocks, num_threads);
    cutilDrvSafeCall( cuLaunchGrid(bitonicBlockFn, num_blocks, 1) );

    /*
     * Phase 2:
     * At this point each block is a sorted list. Now it's time to merge them.
     */

    /* TODO This should go away after development */
    if ( verbose & GROSS_DEBUG ) {
        cuMemcpyDtoH(list, pDeviceArrayA, nBytes);
        for (i=0; i<num_blocks; ++i) {
            printf("### Block %d:\n", i);
            for (j=0; j<num_threads; ++j) {
                printf("%d\n", list[i*num_threads + j]);
            }
        }
    }

    i=0;

    /* Do we need to merge blocks? */
    if ( num_blocks > 1 ) {
        /* There will be Log_2(num_blocks) merge steps. */
        logBlocks = 0;
        for (i=1; i<num_blocks; i *= 2) ++logBlocks;

        if ( verbose & DEBUG )
            printf("There will be %d merge steps.\n", logBlocks);

        block_size = num_threads; /* How big the blocks were in the last grid launch. */
        num_threads = num_blocks >> 1; /* Start with blocks/2 threads */
        num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1;

        cutilDrvSafeCall( cuModuleGetFunction(&mergeBlocksFn, hModule, MERGE_BLOCKS_FN) );

        /* Merge kernel parameter layout as set below: offset 0 = merge level,
         * 4 = block size, 8 = source array, 12 = destination array, for a
         * total of 16 bytes.
         * NOTE(review): unlike the calls above, the cuParamSet* and
         * cuFuncSetBlockShape calls in this section are not wrapped in
         * cutilDrvSafeCall, so their results are ignored. */
        cuParamSeti(mergeBlocksFn, 4, block_size);
        cuParamSetSize(mergeBlocksFn, 16);

        for (i=0; i < logBlocks; ++i) {
            cuFuncSetBlockShape(mergeBlocksFn, num_threads, 1, 1);
            cuParamSeti(mergeBlocksFn, 0, i); /* set merge level */

            /* Merging uses a source array and destination array, the gpu has 2 arrays allocated
             * so we swap which is the source and which is the destination for each iteration. */
            if ( i%2 == 0 ) {
                cuParamSeti(mergeBlocksFn, 8, pDeviceArrayA);
                cuParamSeti(mergeBlocksFn, 12, pDeviceArrayB);
            } else {
                cuParamSeti(mergeBlocksFn, 8, pDeviceArrayB);
                cuParamSeti(mergeBlocksFn, 12, pDeviceArrayA);
            }

            if ( verbose & DEBUG ) {
                printf("Launching block merge kernel with %d blocks and %d threads per block\n", num_blocks, num_threads/num_blocks);
            }
            cutilDrvSafeCall( cuLaunchGrid(mergeBlocksFn, num_blocks, 1) );

            /* Each merge step uses half as many threads as the previous one. */
            num_threads = num_threads >> 1;
            num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1;
        }
    }
// Host code int main() { int N = 3; size_t size = N * sizeof(float); float* h_A = (float*)malloc(size); float* h_B = (float*)malloc(size); float* h_C = (float*)malloc(size); // Set up vectors. for (int i = 0; i < N; ++i) { h_A[i] = i * 1.0; h_B[i] = i * 1.0 + 1; h_C[i] = 0; printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]); } // Initialize if (cuInit(0) != CUDA_SUCCESS) exit (0); // Get number of devices supporting CUDA int deviceCount = 0; cuDeviceGetCount(&deviceCount); if (deviceCount == 0) { printf("There is no device supporting CUDA.\n"); exit (0); } // Get handle for device 0 CUdevice cuDevice = 0; CUresult r1 = cuDeviceGet(&cuDevice, 0); // Create context CUcontext cuContext; cuCtxCreate(&cuContext, 0, cuDevice); // Create module from binary file CUmodule cuModule; CUresult r2 = cuModuleLoad(&cuModule, "VecAdd.ptx"); // Get function handle from module CUfunction vecAdd; CUresult r3 = cuModuleGetFunction(&vecAdd, cuModule, "VecAdd"); // Allocate vectors in device memory CUdeviceptr d_A; CUresult r4 = cuMemAlloc(&d_A, size); CUdeviceptr d_B; CUresult r5 = cuMemAlloc(&d_B, size); CUdeviceptr d_C; CUresult r6 = cuMemAlloc(&d_C, size); // Copy vectors from host memory to device memory // h_A and h_B are input vectors stored in host memory CUresult r7 = cuMemcpyHtoD(d_A, h_A, size); CUresult r8 = cuMemcpyHtoD(d_B, h_B, size); // Invoke kernel #define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) int offset = 0; void* ptr; ptr = (void*)(size_t)d_A; ALIGN_UP(offset, __alignof(ptr)); CUresult r9 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); ptr = (void*)(size_t)d_B; ALIGN_UP(offset, __alignof(ptr)); CUresult r10 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); ptr = (void*)(size_t)d_C; ALIGN_UP(offset, __alignof(ptr)); CUresult r11 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); ptr = (void*)(int)N; ALIGN_UP(offset, __alignof(ptr)); 
CUresult r11a = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); CUresult r12 = cuParamSetSize(vecAdd, offset); int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; CUresult r13 = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1); CUresult r14 = cuLaunchGrid(vecAdd, blocksPerGrid, 1); // Copy result from device memory to host memory // h_C contains the result in host memory CUresult r15 = cuMemcpyDtoH(h_C, d_C, size); for (int i = 0; i < N; ++i) { printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]); } // Free device memory cuMemFree(d_A); cuMemFree(d_B); cuMemFree(d_C); }
// Sets up the CUDA side of the runner: initializes the driver API, picks the
// device (m_deviceindex when it names an existing device, otherwise device 0),
// creates a context with CU_CTX_BLOCKING_SYNC, loads bitcoinminercuda.ptx and
// looks up the "cuda_process" kernel.  Every failure prints a message where
// one is defined and terminates the process with exit(0).
CUDARunner::CUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA)
{
	m_in=0;
	m_devin=0;
	m_out=0;
	m_devout=0;

	int ccMajor=0;
	int ccMinor=0;
	const std::string modulePath("bitcoinminercuda.ptx");

	CUresult status=cuInit(0);
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to initialize CUDA\n");
		exit(0);
	}

	status=cuDeviceGetCount(&m_devicecount);
	printf("%d CUDA GPU devices found\n",m_devicecount);
	if(m_devicecount<=0)
	{
		printf("No CUDA capable devices found\n");
		exit(0);
	}

	// Fall back to the first device when the requested index is out of range.
	const bool indexInRange=(m_deviceindex>=0) && (m_deviceindex<m_devicecount);
	if(indexInRange)
	{
		printf("Setting CUDA device to device %d\n",m_deviceindex);
	}
	else
	{
		m_deviceindex=0;
		printf("Setting CUDA device to first device found\n");
	}
	status=cuDeviceGet(&m_device,m_deviceindex);
	if(status!=CUDA_SUCCESS)
	{
		exit(0);
	}

	cuDeviceComputeCapability(&ccMajor, &ccMinor, m_device);

	status=cuCtxCreate(&m_context,CU_CTX_BLOCKING_SYNC,m_device);
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to create CUDA context\n");
		exit(0);
	}

	printf("Loading module %s\n",modulePath.c_str());
	status=cuModuleLoad(&m_module,modulePath.c_str());
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to load CUDA module: %i\n", status);
		cuCtxDestroy(m_context);
		exit(0);
	}

	status=cuModuleGetFunction(&m_function,m_module,"cuda_process");
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to get function cuda_process %d\n",status);
		cuModuleUnload(m_module);
		cuCtxDestroy(m_context);
		exit(0);
	}

	printf("CUDA initialized\n");
}
double CUDAImpl::Build(std::string * err) { _StartTimer(); if(!_UnloadModule(err)) { return GPUIP_ERROR; } const char * file_helper_math_h = ".helper_math.h"; const char * file_temp_cu = ".temp.cu"; const char * file_temp_ptx = ".temp.ptx"; // Includes vector float operations such as mult, add etc std::ofstream out_helper(file_helper_math_h); out_helper << get_cuda_helper_math(); out_helper.close(); // Create temporary file to compile std::ofstream out(file_temp_cu); out << "#include \"" << file_helper_math_h << "\"\n"; out << "extern \"C\" { \n"; // To avoid function name mangling for(size_t i = 0; i < _kernels.size(); ++i) { out << _kernels[i]->code << "\n"; } out << "}"; // End the extern C bracket out.close(); std::stringstream ss; const char * cuda_bin_path = getenv("CUDA_BIN_PATH"); if (cuda_bin_path != NULL) { ss << cuda_bin_path << "/nvcc"; } else { ss << "nvcc"; } ss << " -ptx " << file_temp_cu << " -o " << file_temp_ptx << " --Wno-deprecated-gpu-targets" << " -include " << file_helper_math_h; if(sizeof(void *) == 4) { ss << " -m32"; } else { ss << " -m64"; } #ifdef _WIN32 const char * cl_bin_path = getenv("CL_BIN_PATH"); if (cl_bin_path != NULL) { ss << " -ccbin \"" << cl_bin_path << "\""; } #endif ss << " 2>&1" << std::endl; // get both standard output and error std::string pipe_err; int nvcc_exit_status = _execPipe(ss.str().c_str(), &pipe_err); // Cleanup temp text file _removeFile(file_helper_math_h); _removeFile(file_temp_cu); if (nvcc_exit_status) { (*err) = "Cuda error: Could not compile kernels:\n"; (*err) += pipe_err; return GPUIP_ERROR; } // Load cuda ptx from file CUresult c_err = cuModuleLoad(&_cudaModule, ".temp.ptx"); _removeFile(file_temp_ptx); if (_cudaErrorLoadModule(c_err, err)) { return GPUIP_ERROR; } _cudaKernels.resize(_kernels.size()); for(size_t i = 0; i < _kernels.size(); ++i) { c_err = cuModuleGetFunction(&_cudaKernels[i], _cudaModule, _kernels[i]->name.c_str()); if (_cudaErrorGetFunction(c_err, err, _kernels[i]->name)) { return 
GPUIP_ERROR; } } _cudaBuild = true; return _StopTimer(); }
int main(int argc, char *argv[]) { argc--; argv++; // Instruction-level test of PTX assembly language and emulator. // This test should work natively and under emulation. Many of the // instructions tested here stress many poorly documented features // of the PTX assembly language. If the emulator passes these // tests, then it can surely pass code that is generated by the // nvcc compiler. test(cuInit(0), "cuInit"); int deviceCount = 0; test(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount"); int device = 0; if (argc) device = atoi(*argv); CUdevice cuDevice = 0; test(cuDeviceGet(&cuDevice, device), "cuDeviceGet"); CUcontext cuContext; int xxx = cuCtxCreate(&cuContext, 0, cuDevice); CUmodule cuModule; test(cuModuleLoad(&cuModule, "inst.ptx"), "cuModuleLoad"); // Do basic test. No sense continuing if we cannot complete this // test. try { CUfunction proc; test(cuModuleGetFunction(&proc, cuModule, "InstBasic"), "cuModuleGetFunction"); bool * h_R = (bool*)malloc(sizeof(bool)); memset(h_R, 0, sizeof(bool)); CUdeviceptr d_R; test(cuMemAlloc(&d_R, sizeof(bool)), "cuMemAlloc"); test(cuMemcpyHtoD(d_R, h_R, sizeof(bool)), "cuMemcpyHtoD"); int offset = 0; void* ptr; ptr = (void*)(size_t)d_R; ALIGN_UP(offset, __alignof(ptr)); test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv"); offset += sizeof(ptr); test(cuParamSetSize(proc, offset), "cuParamSetSize"); int threadsPerBlock = 1; int blocksPerGrid = 1; test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape"); test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid"); test(cuMemcpyDtoH(h_R, d_R, sizeof(bool)), "cuMemcpyDtoH"); test(cuMemFree(d_R), "cuMemFree"); if (h_R[0] == 1) std::cout << "Basic test passed.\n"; else { std::cout << "Basic test failed.\n"; exit(1); } } catch (...) { test(1, "test crashed."); } // Do LD, ST, MOV test. load_and_test(cuModule, "InstLSMC"); // Do ADD, SUB test. load_and_test(cuModule, "InstAddSub"); return 0; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float atime, dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } acc_set_cuda_stream (0, stream); init_timers (1); start_timer (0); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } acc_wait (1); atime = stop_timer (0); if (atime < dtime) { fprintf (stderr, "actual time < delay time\n"); abort (); } start_timer (0); acc_wait (1); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time < delay time\n"); abort (); } acc_unmap_data (a); fini_timers (); free (a); acc_free (d_a); 
acc_shutdown (acc_device_nvidia); return 0; }
void test_driver_api() { CUdevice cuDevice; CUcontext cuContext; CUmodule cuModule; size_t totalGlobalMem; CUfunction matrixMult = 0; // cuda driver api intialization { int major = 0, minor = 0; char deviceName[100]; cuda::Check::CUDAError(cuInit(0), "Error intializing cuda"); int deviceCount; cuda::Check::CUDAError(cuDeviceGetCount(&deviceCount), "Error getting the number of devices"); if (deviceCount <= 0) { std::cerr << "No devices found" << std::endl; return; } cuDeviceGet(&cuDevice, 0); // get compute capabilities and the devicename cuda::Check::CUDAError(cuDeviceComputeCapability(&major, &minor, cuDevice), "Error getting Device compute capability"); cuda::Check::CUDAError(cuDeviceGetName(deviceName, 256, cuDevice), "Error getting device name"); std::cout << "> GPU Device has SM " << major << "." << minor << " compute capability" << std::endl; cuda::Check::CUDAError(cuDeviceTotalMem(&totalGlobalMem, cuDevice), "Error getting totat global memory"); std::cout << " Total amount of global memory: " << (unsigned long long)totalGlobalMem << " bytes" << std::endl; std::string tmp = (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? 
"YES" : "NO"; std::cout << " 64-bit Memory Address: " << tmp << std::endl; cuda::Check::CUDAError(cuCtxCreate(&cuContext, 0, cuDevice), "Error creating the context"); } // Compile and get the function { std::string module_path = "MatrixMult.cubin"; std::cout << "> initCUDA loading module: " << module_path << std::endl; cuda::Check::CUDAError(cuModuleLoad(&cuModule, module_path.c_str()), "Error loading module"); cuda::Check::CUDAError(cuModuleGetFunction(&matrixMult, cuModule, "MatrixMultKernelSimpleDriverAPI"), "Error retrieving the function"); } // Call the kernel { int WIDTH = BLOCK_SIZE; int HEIGHT = BLOCK_SIZE; std::stringstream text; text << "CUDA Matrix Multiplication (" << WIDTH << "x" << WIDTH << ") Simple method Multiplication time"; HostMatrix<float> M(WIDTH, HEIGHT); M.fillWithRandomData(); //M.print(std::cout); HostMatrix<float> N(WIDTH, HEIGHT); N.fill_diagonal(2); //N.print(std::cout); HostMatrix<float> C(WIDTH, HEIGHT); { ScopedTimer t(text.str()); // allocate device memory CUdeviceptr d_M; cuda::Check::CUDAError(cuMemAlloc(&d_M, M.sizeInBytes()), "Error allocating memory"); CUdeviceptr d_N; cuda::Check::CUDAError(cuMemAlloc(&d_N, N.sizeInBytes()), "Error allocating memory"); // copy host memory to device cuda::Check::CUDAError(cuMemcpyHtoD(d_M, M, M.sizeInBytes()), "Error uploading memory to device"); cuda::Check::CUDAError(cuMemcpyHtoD(d_N, N, N.sizeInBytes()), "Error uploading memory to device"); // allocate device memory for result CUdeviceptr d_C; cuda::Check::CUDAError(cuMemAlloc(&d_C, C.sizeInBytes()), "Error allocating memory"); dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid(C.width_ / BLOCK_SIZE, C.height_ / BLOCK_SIZE, 1); void *args[6] = { &d_M, &d_N, &d_C, &WIDTH, &WIDTH, &WIDTH}; // new CUDA 4.0 Driver API Kernel launch call cuda::Check::CUDAError(cuLaunchKernel( matrixMult, // Selected kernel function grid.x, grid.y, grid.z, // grid config block.x, block.y, block.z, // block config 2 * BLOCK_SIZE*BLOCK_SIZE*sizeof(float), NULL, args, 
NULL), "Error executing Kernel"); cuda::Check::CUDAError(cuMemcpyDtoH((void *)C, d_C, C.sizeInBytes()),"Error downloading memory to host"); } C.print(std::cout); } cuCtxDestroy(cuContext); }