CUresult loadCUDAModules() {
    CUmodule cuModule_;
    checkCudaErrors(cuModuleLoad(&cuModule_, "videoPP64.ptx"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelNV12toARGB, cuModule_, "NV12ToARGBdrvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBtoNV12, cuModule_, "ARGBToNv12drvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBpostprocess, cuModule_, "ARGBpostprocess"));
    return CUDA_SUCCESS;
}
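Most snippets in this section wrap Driver API calls in an error-checking helper (checkCudaErrors, CU_ERROR_CHECK, CUDA_CHECK, cu_assert, OCCA_CUDA_CHECK) whose definition is not shown. A minimal sketch of such a macro, built only on the public Driver API; the name CHECK_CU and the exit-on-failure policy are illustrative choices, not the original helpers:

// Sketch of a Driver API error-check macro (names and policy are assumptions).
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK_CU(call)                                                     \
    do {                                                                   \
        CUresult err_ = (call);                                            \
        if (err_ != CUDA_SUCCESS) {                                        \
            const char *name_ = NULL;                                      \
            cuGetErrorName(err_, &name_);  /* human-readable error name */ \
            fprintf(stderr, "%s failed at %s:%d: %s\n",                    \
                    #call, __FILE__, __LINE__, name_ ? name_ : "unknown"); \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

A library-style variant would return the CUresult to the caller instead of calling exit(); the snippets below illustrate both conventions.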
CUfunction CudaModule::getKernel(const std::string& name, int paramSize) { CUfunction kernel = NULL; cuModuleGetFunction(&kernel, m_module, name.c_str()); if (!kernel) { std::string funcName(std::string("__globfunc_") + name); cuModuleGetFunction( &kernel, m_module, funcName.c_str() ); } if (kernel) { checkError( "cuParamSetSize", cuParamSetSize(kernel, paramSize)); } return kernel; }
/* int is 64-bit for some reason... */ CUresult bpnn_adjust_weights_launch (CUmodule mod, CUdeviceptr delta, long hid, CUdeviceptr ly, long in, CUdeviceptr w, CUdeviceptr oldw) { int bdx, bdy, gdx, gdy; void* param[] = {&delta, &hid, &ly, &in, &w, &oldw}; CUfunction f; CUresult res; bdx = 16; bdy = 16; gdx = 1; gdy = num_blocks; /* get functions. */ res = cuModuleGetFunction(&f, mod, "_Z24bpnn_adjust_weights_cudaPfiS_iS_S_"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction(adjust_weights) failed: res = %u\n", res); return res; } res = cuLaunchKernel(f, gdx, gdy, 1, bdx, bdy, 1, 0, 0, (void**) param, 0); if (res != CUDA_SUCCESS) { printf("cuLaunchKernel(adjust_weights) failed: res = %u\n", res); return res; } return CUDA_SUCCESS; }
CUresult CreateCuFunction(const char* name, CuModule* module, int3 blockShape, FunctionPtr* ppFunction) { CUfunction func; CUresult result = cuModuleGetFunction(&func, module->Handle(), name); if(CUDA_SUCCESS != result) return result; FunctionPtr f(new CuFunction); CuFuncAttr& attr = f->_attributes; cuFuncGetAttribute(&attr.maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func); cuFuncGetAttribute(&attr.sharedSizeBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func); cuFuncGetAttribute(&attr.constSizeBytes, CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, func); cuFuncGetAttribute(&attr.localSizeBytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, func); cuFuncGetAttribute(&attr.numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, func); cuFuncGetAttribute(&attr.ptxVersion, CU_FUNC_ATTRIBUTE_PTX_VERSION, func); cuFuncGetAttribute(&attr.binaryVersion, CU_FUNC_ATTRIBUTE_BINARY_VERSION, func); f->_function = func; f->_module = module; f->_functionName = name; f->_blockShape = blockShape; ppFunction->swap(f); return CUDA_SUCCESS; }
int main(){ init_test(); const std::string source = ".version 4.2\n" ".target sm_20\n" ".address_size 64\n" ".visible .entry kernel(.param .u64 kernel_param_0) {\n" ".reg .s32 %r<2>;\n" ".reg .s64 %rd<3>;\n" "bra BB1_2;\n" "ld.param.u64 %rd1, [kernel_param_0];\n" "cvta.to.global.u64 %rd2, %rd1;\n" "mov.u32 %r1, 5;\n" "st.global.u32 [%rd2], %r1;\n" "BB1_2: ret;\n" "}\n"; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel")); CUdeviceptr devValue; int hostValue = 10; cu_assert(cuMemAlloc(&devValue, sizeof(int))); cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue))); void * params[] = {&devValue}; cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr)); cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue))); assert(hostValue == 10); std::cout << hostValue << "\n"; cu_assert(cuMemFree(devValue)); cu_assert(cuModuleUnload(modId)); return 0; }
WEAK int halide_dev_run(void *user_context, void *state_ptr, const char* entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, size_t arg_sizes[], void* args[]) { DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n", user_context, entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes ); CUresult err; CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, state_ptr); CUmodule mod = ((module_state*)state_ptr)->module; halide_assert(user_context, mod); CUfunction f; err = cuModuleGetFunction(&f, mod, entry_name); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)", _get_error_name(err)); return err; } err = cuLaunchKernel(f, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes, NULL, // stream args, NULL); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)", _get_error_name(err)); return err; } #ifdef DEBUG err = cuCtxSynchronize(); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n", _get_error_name(err)); return err; } uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
CUresult bpnn_layerforward_launch (CUmodule mod, CUdeviceptr input_cuda, CUdeviceptr output_hidden_cuda, CUdeviceptr input_hidden_cuda, CUdeviceptr hidden_partial_sum, int in, int hid) { int bdx, bdy, gdx, gdy; void* param[] = {&input_cuda, &output_hidden_cuda, &input_hidden_cuda, &hidden_partial_sum, &in, &hid}; CUfunction f; CUresult res; bdx = 16; bdy = 16; gdx = 1; gdy = num_blocks; /* get functions. */ res = cuModuleGetFunction(&f, mod, "_Z22bpnn_layerforward_CUDAPfS_S_S_ii"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction(layerforward) failed: res = %u\n", res); return res; } res = cuLaunchKernel(f, gdx, gdy, 1, bdx, bdy, 1, 0, 0, (void**) param, 0); if (res != CUDA_SUCCESS) { printf("cuLaunchKernel(layerforward) failed: res = %u\n", res); return res; } return CUDA_SUCCESS; }
Object cuda_over_map(Object self, int nparts, int *argcv, Object *argv, int flags) { CUresult error; cuInit(0); int deviceCount = 0; error = cuDeviceGetCount(&deviceCount); if (deviceCount == 0) { raiseError("No CUDA devices found"); } CUdevice cuDevice; CUcontext cuContext; CUmodule cuModule; CUfunction cuFunc; error = cuDeviceGet(&cuDevice, 0); error = cuCtxCreate(&cuContext, 0, cuDevice); CUdeviceptr d_A; CUdeviceptr d_B; CUdeviceptr d_res; errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]]))); CUdeviceptr dps[argcv[0]]; void *args[argcv[0]+2]; int size = INT_MAX; for (int i=0; i<argcv[0]; i++) { struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i]; if (a->size < size) size = a->size; errcheck(cuMemAlloc(&dps[i], size * sizeof(float))); errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float))); args[i+1] = &dps[i]; } struct CudaFloatArray *r = (struct CudaFloatArray *)(alloc_CudaFloatArray(size)); int fsize = sizeof(float) * size; errcheck(cuMemAlloc(&d_res, fsize)); errcheck(cuMemcpyHtoD(d_res, &r->data, fsize)); args[0] = &d_res; args[argcv[0]+1] = &size; int threadsPerBlock = 256; int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; char name[256]; strcpy(name, "block"); strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/")); for (int i=0; name[i] != 0; i++) if (name[i] == '.') { name[i] = 0; break; } errcheck(cuModuleGetFunction(&cuFunc, cuModule, name)); errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL)); errcheck(cuMemcpyDtoH(&r->data, d_res, fsize)); cuMemFree(d_res); for (int i=0; i<argcv[0]; i++) cuMemFree(dps[i]); return (Object)r; }
static CUresult initCuda(CUcontext _cuContext, char* executablePath, CUfunction *mathop,
                         int argc, char** argv, const char* cubin_name, const char* kernel_name) {
    CUdevice cuDevice;
    CUT_DEVICE_INIT_DRV(cuDevice, argc, argv);
    print_GetProperties(cuDevice);

    CUresult status = cuCtxCreate(&_cuContext, 0, cuDevice);
    if (CUDA_SUCCESS != status) {
        Error(_cuContext, status);
    } else
        printf("(1) context creation successful\n");

    char* module_path = cutFindFilePath(cubin_name, executablePath);
    printf("\t cubin:%s, path:%s, mmp_ptr:%p\n", cubin_name, executablePath, (void*)module_path);
    if (module_path != NULL)
        printf("\t cubin:%s, path:%s, module_path:%c%c%c%c\n", cubin_name, executablePath,
               *module_path, *(module_path+1), *(module_path+2), *(module_path+3));

    const char* data_path = "./data/";
    size_t len_path = strlen(data_path);
    size_t len_fn = strlen(cubin_name);
    // printf("Sizes: data:%lu, cubinname:%lu\n", len_path, len_fn);

    /* +1 for the terminating NUL */
    char* module_path_new = (char*)malloc(len_path + len_fn + 1);
    if (module_path_new == NULL) {
        status = CUDA_ERROR_NOT_FOUND;
        Error(_cuContext, status);
    }
    strcpy(module_path_new, data_path);
    strcat(module_path_new, cubin_name);

    FILE *fp = fopen(module_path_new, "r");
    if (fp) {
        printf("(2) cubin_File found in modulepath:%s\n", module_path_new);
        fclose(fp);
    } else {
        printf("(2) cubin file does not exist: %s\n", module_path_new);
    }

    CUmodule cuModule;
    status = cuModuleLoad(&cuModule, module_path_new);
    free(module_path_new);
    if (CUDA_SUCCESS != status) {
        Error(_cuContext, status);
    } else
        printf("(3) module Load successful\n");

    CUfunction cuFunction = 0;
    status = cuModuleGetFunction(&cuFunction, cuModule, kernel_name);
    if (CUDA_SUCCESS != status) {
        Error(_cuContext, status);
    } else
        printf("(4) getFunction successful w/cuFunction\n");

    *mathop = cuFunction;
    return CUDA_SUCCESS;
}
/*
 * This function loads the PTX file ptxPath and extracts the kernel kName
 * into phKernel.
 * @param phModule Output module handle
 * @param phKernel Output kernel handle
 * @param ptxPath  PTX file name
 * @param kName    kernel name
 */
void ptxJIT(CUmodule *phModule, CUfunction *phKernel, const char *ptxPath, const char *kName)
{
    CUlinkState cuLinkState;
    CUjit_option options[6];
    void *optionVals[6];
    float walltime;
    char error_log[8192], info_log[8192];
    unsigned int logSize = 8192;
    void *cuOut;
    size_t outSize;
    CUresult myErr = CUDA_SUCCESS;

    // Setup linker options
    // Return walltime from JIT compilation
    options[0] = CU_JIT_WALL_TIME;
    optionVals[0] = (void *) &walltime;
    // Pass a buffer for info messages
    options[1] = CU_JIT_INFO_LOG_BUFFER;
    optionVals[1] = (void *) info_log;
    // Pass the size of the info buffer
    options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    optionVals[2] = (void *) (long) logSize;
    // Pass a buffer for error messages
    options[3] = CU_JIT_ERROR_LOG_BUFFER;
    optionVals[3] = (void *) error_log;
    // Pass the size of the error buffer
    options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
    optionVals[4] = (void *) (long) logSize;
    // Make the linker verbose
    options[5] = CU_JIT_LOG_VERBOSE;
    optionVals[5] = (void *) 1;

    // Create a pending linker invocation
    checkCudaErrors(cuLinkCreate(6, options, optionVals, &cuLinkState));

    // Load the PTX from the file
    myErr = cuLinkAddFile(cuLinkState, CU_JIT_INPUT_PTX, ptxPath, 0, 0, 0);
    if (myErr != CUDA_SUCCESS) {
        // Errors are written to error_log, per the CU_JIT_ERROR_LOG_BUFFER option above.
        fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
    }

    // Complete the linker step
    checkCudaErrors(cuLinkComplete(cuLinkState, &cuOut, &outSize));

    // Linker walltime and info_log were requested in the options above.
    printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime, info_log);

    // Load the resulting cubin into a module
    checkCudaErrors(cuModuleLoadData(phModule, cuOut));

    // Locate the kernel entry point
    checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, kName));

    // Destroy the linker invocation
    checkCudaErrors(cuLinkDestroy(cuLinkState));
}
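The ptxJIT helper above returns both the module and the kernel handle. A minimal usage sketch, assuming a context is already current, a file named kernel.ptx, an entry point myKernel that takes no parameters (all three are illustrative assumptions, not taken from the snippet), and the same checkCudaErrors helper:

// Hedged usage sketch for ptxJIT; file name, entry name, and the
// parameterless kernel signature are assumptions for illustration.
CUmodule   module = 0;
CUfunction kernel = 0;
ptxJIT(&module, &kernel, "kernel.ptx", "myKernel");

// Launch the JIT-linked kernel with one block of 32 threads, no parameters.
checkCudaErrors(cuLaunchKernel(kernel, 1, 1, 1, 32, 1, 1, 0, NULL, NULL, NULL));
checkCudaErrors(cuCtxSynchronize());
checkCudaErrors(cuModuleUnload(module));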
/** * This measures the overhead in launching a kernel function on each GPU in the * system. * * It does this by executing a small kernel (copying 1 value in global memory) a * very large number of times and taking the average execution time. This * program uses the CUDA driver API. */ int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); float x = 5.0f; for (int d = 0; d < count; d++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); CUdeviceptr in, out; CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float))); CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float))); CUmodule module; CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes)); CUfunction function; CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel")); void * params[] = { &in, &out }; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, 0)); CU_ERROR_CHECK(cuEventCreate(&stop, 0)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (int i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuMemFree(in)); CU_ERROR_CHECK(cuMemFree(out)); fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS)); CU_ERROR_CHECK(cuModuleUnload(module)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
CUresult TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha ) { CUresult status; CUdeviceptr dptrOut = 0; CUdeviceptr dptrIn = 0; float *hostOut = 0; float *hostIn = 0; CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) ); CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) ); CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) ); CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) ); CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) ); CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) ); for ( size_t i = 0; i < N; i++ ) { hostIn[i] = (float) rand() / (float) RAND_MAX; } CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) ); { CUmodule moduleSAXPY; CUfunction kernelSAXPY; void *params[] = { &dptrOut, &dptrIn, &N, &alpha }; moduleSAXPY = chDevice->module( "saxpy.ptx" ); if ( ! moduleSAXPY ) { status = CUDA_ERROR_NOT_FOUND; goto Error; } CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) ); CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) ); } CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) ); CUDA_CHECK( cuCtxSynchronize() ); for ( size_t i = 0; i < N; i++ ) { if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) { status = CUDA_ERROR_UNKNOWN; goto Error; } } status = CUDA_SUCCESS; printf( "Well it worked!\n" ); Error: cuCtxPopCurrent( NULL ); cuMemFreeHost( hostOut ); cuMemFreeHost( hostIn ); cuMemFree( dptrOut ); cuMemFree( dptrIn ); return status; }
/* * Class: edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2 * Method: loadFunction * Signature: ()V */ JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jstring filename, jint num_blocks){ void * cubin_file; int offset; CUresult status; char * native_filename; heapEndPtr = heap_end_ptr; native_filename = (*env)->GetStringUTFChars(env, filename, 0); status = cuModuleLoad(&cuModule, native_filename); CHECK_STATUS(env, "error in cuModuleLoad", status); (*env)->ReleaseStringUTFChars(env, filename, native_filename); status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_S0_i"); CHECK_STATUS(env,"error in cuModuleGetFunction",status) status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1); CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status) status = cuParamSetSize(cuFunction, (7 * sizeof(CUdeviceptr) + sizeof(int))); CHECK_STATUS(env,"error in cuParamSetSize",status) offset = 0; status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuClassMemory, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuClassMemory",status) offset += sizeof(CUdeviceptr); status = cuParamSeti(cuFunction, offset, num_blocks); CHECK_STATUS(env,"error in cuParamSetv num_blocks",status) offset += sizeof(int); }
/// Constructor. Extracts a backend::kernel instance from backend::program. kernel(const command_queue &queue, const program &P, const std::string &name, std::function<size_t(size_t)> smem ) : ctx(queue.context()), P(P), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, smem); }
/// Constructor. Creates a backend::kernel instance from source. kernel(const command_queue &queue, const std::string &src, const std::string &name, std::function<size_t(size_t)> smem, const std::string &options = "" ) : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, smem); }
CAMLprim value spoc_cuda_debug_compile(value moduleSrc, value function_name, value gi){ CAMLparam3(moduleSrc, function_name, gi); CUmodule module; CUfunction *kernel; char* functionN; char *ptx_source; const unsigned int jitNumOptions = 4; CUjit_option jitOptions[4]; void *jitOptVals[4]; int jitLogBufferSize; char *jitLogBuffer; int jitRegCount = 32; BLOCKING_CUDA_GET_CONTEXT; kernel = malloc(sizeof(CUfunction)); functionN = String_val(function_name); ptx_source = String_val(moduleSrc); // set up size of compilation log buffer jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; jitLogBufferSize = 1024; jitOptVals[0] = (void *)(size_t)jitLogBufferSize; // set up pointer to the compilation log buffer jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; jitLogBuffer = malloc(sizeof(char)*jitLogBufferSize); jitOptVals[1] = jitLogBuffer; // set up pointer to set the Maximum # of registers for a particular kernel jitOptions[2] = CU_JIT_MAX_REGISTERS; jitOptVals[2] = (void *)(size_t)jitRegCount; jitOptions[3] = CU_JIT_TARGET_FROM_CUCONTEXT; //CU_JIT_TARGET; // jitOptVals[3] = (void*)(uintptr_t)CU_TARGET_COMPUTE_10; cuda_error = (cuModuleLoadDataEx(&module, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals)); if (cuda_error) { fprintf (stderr,"%s\n", jitLogBuffer); fflush (stderr); } cuda_error = (cuModuleGetFunction(kernel, module, functionN)); if (cuda_error) { fprintf (stderr, "%s\n", jitLogBuffer); fflush (stderr); } BLOCKING_CUDA_RESTORE_CONTEXT; free(jitLogBuffer); CAMLreturn((value) kernel); }
/*
 * Initialization in order to use the kernel program
 */
void init_cuda(void){
    thread_num = (N <= 16) ? N : 16;
    block_num = N / (thread_num*thread_num);
    if (N % (thread_num*thread_num) != 0)
        block_num++;

    res = cuInit(0);
    if (res != CUDA_SUCCESS) {
        printf("cuInit failed: res = %s\n", conv(res));
        exit(1);
    }
    res = cuDeviceGet(&dev, 0);
    if (res != CUDA_SUCCESS) {
        printf("cuDeviceGet failed: res = %s\n", conv(res));
        exit(1);
    }
    res = cuCtxCreate(&ctx, 0, dev);
    if (res != CUDA_SUCCESS) {
        printf("cuCtxCreate failed: res = %s\n", conv(res));
        exit(1);
    }
    res = cuModuleLoad(&module, "./cuda_main.cubin");
    if (res != CUDA_SUCCESS) {
        printf("cuModuleLoad() failed: res = %s\n", conv(res));
        exit(1);
    }
    res = cuModuleGetFunction(&function, module, "cuda_main");
    if (res != CUDA_SUCCESS) {
        printf("cuModuleGetFunction() failed: res = %s\n", conv(res));
        exit(1);
    }

    /*
     * preparation for launching the kernel
     */
    res = cuFuncSetSharedSize(function, 0x40); /* just random */
    if (res != CUDA_SUCCESS) {
        printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
        exit(1);
    }
    res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
    if (res != CUDA_SUCCESS) {
        printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
        exit(1);
    }
}
/* * Class: edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2 * Method: loadFunction * Signature: ()V */ JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jobject buffers, jint size, jint total_size, jint num_blocks){ void * cubin_file; int offset; CUresult status; heapEndPtr = heap_end_ptr; //void * cubin_file = readCubinFile("code_file.cubin"); cubin_file = readCubinFileFromBuffers(env, buffers, size, total_size); status = cuModuleLoadData(&cuModule, cubin_file); CHECK_STATUS(env,"error in cuModuleLoad",status) free(cubin_file); status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_i"); CHECK_STATUS(env,"error in cuModuleGetFunction",status) status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1); CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status) status = cuParamSetSize(cuFunction, (6 * sizeof(CUdeviceptr) + sizeof(int))); CHECK_STATUS(env,"error in cuParamSetSize",status) offset = 0; status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status) offset += sizeof(CUdeviceptr); status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr)); CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status) offset += sizeof(CUdeviceptr); status = cuParamSeti(cuFunction, offset, num_blocks); CHECK_STATUS(env,"error in cuParamSetv num_blocks",status) offset += sizeof(int); }
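The two Rootbeer loadFunction bindings above, like init_cuda and loadAndRunDualTestFunction, use the legacy pre-CUDA-4.0 execution-control API (cuParamSetSize, cuParamSetv, cuParamSeti, cuFuncSetBlockShape, cuLaunchGrid), which has long been deprecated in favor of cuLaunchKernel. A minimal sketch of the equivalent parameter passing with cuLaunchKernel, reusing the six device pointers and the trailing int from the snippet directly above; the grid and block dimensions and the zero dynamic shared memory size are illustrative assumptions:

// Sketch only: cuLaunchKernel-style replacement for the cuParamSet* sequence.
// Each kernel argument is passed by address in the args array.
void *args[] = {
    &gcInfoSpace, &gpuToSpace, &gpuHandlesMemory, &gpuHeapEndPtr,
    &gpuBufferSize, &gpuExceptionsMemory, &num_blocks
};
status = cuLaunchKernel(cuFunction,
                        num_blocks, 1, 1,   /* grid dimensions (illustrative) */
                        256, 1, 1,          /* block dimensions (illustrative) */
                        0,                  /* dynamic shared memory bytes */
                        NULL,               /* default stream */
                        args, NULL);
CHECK_STATUS(env, "error in cuLaunchKernel", status)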
/// Constructor. Extracts a backend::kernel instance from backend::program. kernel(const command_queue &queue, const program &P, const std::string &name, size_t smem_per_thread = 0 ) : ctx(queue.context()), P(P), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; }); }
CUresult loadAndRunDualTestFunction(CUmodule *phModule, std::string name,
                                    CUdeviceptr &d_data0, CUdeviceptr &d_data1,
                                    DataStruct *h_data0, DataStruct *h_data1,
                                    unsigned int memSize,
                                    int thread_x=1, int thread_y=1, int thread_z=1,
                                    int block_x=1, int block_y=1, int block_z=1)
{
    // std::cout << " Start Loading" << std::endl;
    // load the data to the device
    cuMemcpyHtoD(d_data0, h_data0, memSize);
    cuMemcpyHtoD(d_data1, h_data1, memSize);

    // Locate the kernel entry point
    CUfunction phKernel = 0;
    CUresult status = cuModuleGetFunction(&phKernel, *phModule, name.data());
    if (status != CUDA_SUCCESS) { printf("ERROR: could not load function\n"); }

    // Set the kernel parameters
    status = cuFuncSetBlockShape(phKernel, thread_x, thread_y, thread_z);
    if (status != CUDA_SUCCESS) { printf("ERROR: during setBlockShape\n"); }

    int paramOffset = 0, size = 0;
    size = sizeof(CUdeviceptr);
    status = cuParamSetv(phKernel, paramOffset, &d_data0, size);
    paramOffset += size;
    status = cuParamSetv(phKernel, paramOffset, &d_data1, size);
    paramOffset += size;
    status = cuParamSetSize(phKernel, paramOffset);
    if (status != CUDA_SUCCESS) { printf("ERROR: during cuParamSetv\n"); }

    // Launch the kernel
    status = cuLaunchGrid(phKernel, block_x, block_y);
    if (status != CUDA_SUCCESS) { printf("ERROR: during grid launch\n"); }
    // std::cout << " launched CUDA kernel!!" << std::endl;

    // Copy the result back to the host
    status = cuMemcpyDtoH(h_data0, d_data0, memSize);
    status = cuMemcpyDtoH(h_data1, d_data1, memSize);
    if (status != CUDA_SUCCESS) { printf("ERROR: during MemcpyDtoH\n"); }

    return status;
}
/* Returns the index of the destination temperature buffer, or -1 on error. */
int compute_tran_temp(CUmodule mod, CUdeviceptr MatrixPower, CUdeviceptr MatrixTemp[2],
                      int col, int row, int total_iterations, int num_iterations,
                      int blockCols, int blockRows, int borderCols, int borderRows)
{
    int gdx = blockCols;
    int gdy = blockRows;
    int bdx = BLOCK_SIZE;
    int bdy = BLOCK_SIZE;

    float grid_height = chip_height / row;
    float grid_width = chip_width / col;
    float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
    float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
    float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
    float Rz = t_chip / (K_SI * grid_height * grid_width);
    float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
    float step = PRECISION / max_slope;
    float t;
    float time_elapsed;
    time_elapsed = 0.001;

    int src = 1, dst = 0;
    CUfunction f;
    CUresult res;

    res = cuModuleGetFunction(&f, mod, "_Z14calculate_tempiPfS_S_iiiiffffff");
    if (res != CUDA_SUCCESS) {
        printf("cuModuleGetFunction failed: res = %u\n", res);
        return -1;
    }

    for (t = 0; t < total_iterations; t += num_iterations) {
        int it = MIN(num_iterations, total_iterations - t);
        int temp = src;
        src = dst;
        dst = temp;
        void *param[] = {&it, &MatrixPower, &MatrixTemp[src], &MatrixTemp[dst],
                         &col, &row, &borderCols, &borderRows,
                         &Cap, &Rx, &Ry, &Rz, &step, &time_elapsed};
        res = cuLaunchKernel(f, gdx, gdy, 1, bdx, bdy, 1, 0xc00, 0, (void**) param, NULL);
        if (res != CUDA_SUCCESS) {
            printf("cuLaunchKernel(calculate_temp) failed: res = %u\n", res);
            return -1;
        }
    }

    return dst;
}
GpuCompilationContext::GpuCompilationContext(const void* image, const std::string& kernel_name, const int device_id, const void* cuda_mgr, unsigned int num_options, CUjit_option* options, void** option_vals) : module_(nullptr), kernel_(nullptr), device_id_(device_id), cuda_mgr_(cuda_mgr) { static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_)->setContext(device_id_); checkCudaErrors(cuModuleLoadDataEx(&module_, image, num_options, options, option_vals)); CHECK(module_); checkCudaErrors(cuModuleGetFunction(&kernel_, module_, kernel_name.c_str())); }
/// Constructor. Creates a backend::kernel instance from source. kernel(const command_queue &queue, const std::string &src, const std::string &name, size_t smem_per_thread = 0, const std::string &options = "" ) : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; }); }
static CUfunction __get_kernel(void *user_context, const char* entry_name) { CUfunction f; #ifdef DEBUG char msg[256]; snprintf(msg, 256, "get_kernel %s (t=%lld)", entry_name, (long long)halide_current_time_ns(user_context) ); #endif // Get kernel function ptr TIME_CALL( cuModuleGetFunction(&f, __mod, entry_name), msg ); return f; }
SEXP R_auto_cuModuleGetFunction(SEXP r_hfunc, SEXP r_hmod, SEXP r_name) { SEXP r_ans = R_NilValue; CUfunction * hfunc = GET_REF(r_hfunc, CUfunction ); CUmodule hmod = (CUmodule) getRReference(r_hmod); const char * name = CHAR(STRING_ELT(r_name, 0)); CUresult ans; ans = cuModuleGetFunction(hfunc, hmod, name); r_ans = Renum_convert_CUresult(ans) ; return(r_ans); }
kernel_t<CUDA>* kernel_t<CUDA>::buildFromBinary(const std::string &filename, const std::string &functionName_){ OCCA_EXTRACT_DATA(CUDA, Kernel); functionName = functionName_; OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module", cuModuleLoad(&data_.module, filename.c_str())); OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function", cuModuleGetFunction(&data_.function, data_.module, functionName.c_str())); return this; }
kernel_t<CUDA>* kernel_t<CUDA>::loadFromLibrary(const char *cache, const std::string &functionName_){ OCCA_EXTRACT_DATA(CUDA, Kernel); functionName = functionName_; OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module", cuModuleLoadData(&data_.module, cache)); OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function", cuModuleGetFunction(&data_.function, data_.module, functionName.c_str())); return this; }
static CUfunction __get_kernel(const char* entry_name) { CUfunction f; #ifndef NDEBUG char msg[256]; snprintf(msg, 256, "get_kernel %s (t=%d)", entry_name, halide_current_time() ); #endif // Get kernel function ptr TIME_CALL( cuModuleGetFunction(&f, __mod, entry_name), msg ); return f; }
GPUFunction GPUInterface::GetFunction(const char* functionName) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::GetFunction\n"); #endif GPUFunction cudaFunction; SAFE_CUPP(cuModuleGetFunction(&cudaFunction, cudaModule, functionName)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::GetFunction\n"); #endif return cudaFunction; }
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }