/*
 * Load a PTX image as a named CUDA module and register it in the global
 * module table.  Calling again with a name that is already registered is a
 * no-op.
 *
 * module  - unique module name, used as the cache key
 * ptx     - PTX image handed to cuModuleLoadData (must be non-NULL)
 * len     - size of the image; 0 is treated as "no source"
 * devtype - not used in this function; kept for interface compatibility
 *
 * NOTE(review): error() is assumed not to return (the original code also
 * relied on that after reporting invalid source) -- confirm.
 */
void swanLoadProgramFromSource( const char *module, const unsigned char *ptx, size_t len , int devtype ) {
    int i = 0;
    CUresult err;
    CUmodule *new_mods;
    char **new_names;

    try_init();

    /* Already loaded?  The table is keyed by module name. */
    for( i = 0; i < state.num_mods; i++ ) {
        if( !strcmp( state.mod_names[i], module ) ) {
            return; /* already loaded */
        }
    }

    if( ptx == NULL || len == 0 ) {
        fprintf ( stderr, "SWAN : Module load failure [%s]. No source \n", module );
        error( "Module source invalid" );
    }

    /* Grow the tables.  BUG FIX: the original assigned the realloc results
     * without checking them, so an OOM would both leak the old table and
     * crash on the next access.  realloc leaves the old block valid on
     * failure, so checking into a temporary is safe. */
    new_mods = (CUmodule*) realloc( state.mods, (state.num_mods + 1) * sizeof(CUmodule) );
    if( new_mods == NULL ) {
        error( "Out of memory growing module table" );
    }
    state.mods = new_mods;

    new_names = (char**) realloc( state.mod_names, (state.num_mods + 1) * sizeof(char*) );
    if( new_names == NULL ) {
        error( "Out of memory growing module name table" );
    }
    state.mod_names = new_names;

    i = state.num_mods;

    state.mod_names[i] = (char*) malloc( strlen( module ) + 1 );
    if( state.mod_names[i] == NULL ) {
        error( "Out of memory copying module name" );
    }
    strcpy( state.mod_names[i], module );
    state.num_mods++;

    /* Hand the PTX image to the driver. */
    err = cuModuleLoadData( &state.mods[i], ptx );
    if( err != CUDA_SUCCESS ) {
        fprintf ( stderr, "SWAN : Module load result [%s] [%d]\n", module, err );
        error( "Module load failed\n" );
    }
}
int main(){ init_test(); const std::string source = ".version 4.2\n" ".target sm_20\n" ".address_size 64\n" ".visible .entry kernel(.param .u64 kernel_param_0) {\n" ".reg .s32 %r<2>;\n" ".reg .s64 %rd<3>;\n" "bra BB1_2;\n" "ld.param.u64 %rd1, [kernel_param_0];\n" "cvta.to.global.u64 %rd2, %rd1;\n" "mov.u32 %r1, 5;\n" "st.global.u32 [%rd2], %r1;\n" "BB1_2: ret;\n" "}\n"; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel")); CUdeviceptr devValue; int hostValue = 10; cu_assert(cuMemAlloc(&devValue, sizeof(int))); cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue))); void * params[] = {&devValue}; cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr)); cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue))); assert(hostValue == 10); std::cout << hostValue << "\n"; cu_assert(cuMemFree(devValue)); cu_assert(cuModuleUnload(modId)); return 0; }
/* * This function load the ptx file ptxPath and extract the kernel kName * to phKernel * @param phKernel Output kernel handle * @param ptxPath ptx file name * @param kName kernel name */ void ptxJIT(CUmodule *phModule, CUfunction *phKernel, const char *ptxPath, const char *kName) { CUlinkState cuLinkState; CUjit_option options[6]; void *optionVals[6]; float walltime; char error_log[8192], info_log[8192]; unsigned int logSize = 8192; void *cuOut; size_t outSize; int myErr = 0; // Setup linker options // Return walltime from JIT compilation options[0] = CU_JIT_WALL_TIME; optionVals[0] = (void *) &walltime; // Pass a buffer for info messages options[1] = CU_JIT_INFO_LOG_BUFFER; optionVals[1] = (void *) info_log; // Pass the size of the info buffer options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; optionVals[2] = (void *) (long)logSize; // Pass a buffer for error message options[3] = CU_JIT_ERROR_LOG_BUFFER; optionVals[3] = (void *) error_log; // Pass the size of the error buffer options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; optionVals[4] = (void *) (long) logSize; // Make the linker verbose options[5] = CU_JIT_LOG_VERBOSE; optionVals[5] = (void *) 1; // Create a pending linker invocation checkCudaErrors(cuLinkCreate(6,options, optionVals, &cuLinkState)); // Load the ptx from the file myErr = cuLinkAddFile(cuLinkState, CU_JIT_INPUT_PTX, ptxPath, 0, 0, 0); if (myErr != CUDA_SUCCESS){ // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); } // Complete the linker step checkCudaErrors(cuLinkComplete(cuLinkState, &cuOut, &outSize)); // Linker walltime and info_log were requested in options above. printf("CUDA Link Completed in %fms. 
Linker Output:\n%s\n", walltime, info_log); // Load resulting cuBin into module checkCudaErrors(cuModuleLoadData(phModule, cuOut)); // Locate the kernel entry point checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, kName)); // Destroy the linker invocation checkCudaErrors(cuLinkDestroy(cuLinkState)); }
/** * This measures the overhead in launching a kernel function on each GPU in the * system. * * It does this by executing a small kernel (copying 1 value in global memory) a * very large number of times and taking the average execution time. This * program uses the CUDA driver API. */ int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); float x = 5.0f; for (int d = 0; d < count; d++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); CUdeviceptr in, out; CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float))); CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float))); CUmodule module; CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes)); CUfunction function; CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel")); void * params[] = { &in, &out }; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, 0)); CU_ERROR_CHECK(cuEventCreate(&stop, 0)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (int i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuMemFree(in)); CU_ERROR_CHECK(cuMemFree(out)); fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS)); CU_ERROR_CHECK(cuModuleUnload(module)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 *
 * Loads the cubin assembled from the Java-side buffers into cuModule,
 * resolves the entry kernel, and pre-binds its parameter block with the
 * legacy (pre-4.0, deprecated) cuParamSet* driver API: six pointer-sized
 * globals followed by one int (num_blocks).
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jobject buffers, jint size, jint total_size, jint num_blocks){

  void * cubin_file;
  int offset;
  CUresult status;

  heapEndPtr = heap_end_ptr;

  //void * cubin_file = readCubinFile("code_file.cubin");
  /* Reassemble the cubin from the Java buffer objects; freed after the
     driver has copied it into the module. */
  cubin_file = readCubinFileFromBuffers(env, buffers, size, total_size);
  status = cuModuleLoadData(&cuModule, cubin_file);
  CHECK_STATUS(env,"error in cuModuleLoad",status)
  free(cubin_file);

  /* Mangled name of the generated entry kernel. */
  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_i");
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* Total parameter-block size: six CUdeviceptr slots + one int,
     matching the cuParamSetv/cuParamSeti sequence below. */
  status = cuParamSetSize(cuFunction, (6 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  /* NOTE(review): stray '%' at the end of this message (runtime string,
     left untouched here). */
  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  /* Scalar tail parameter: grid size the kernel was generated for. */
  status = cuParamSeti(cuFunction, offset, num_blocks);
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
  offset += sizeof(int);
}
/* R wrapper for cuModuleLoadData: loads the module image referenced by
 * r_image and returns it as an external "CUmodule" reference, or an
 * error-info object when the driver call fails. */
SEXP R_auto_cuModuleLoadData(SEXP r_image)
{
    CUmodule module;
    const void * image = GET_REF(r_image, const void );

    CUresult status = cuModuleLoadData(& module, image);
    if(status)
        return(R_cudaErrorInfo(status));

    return(R_createRef(module, "CUmodule"));
}
// Loads a pre-built kernel: hands `cache` to cuModuleLoadData and resolves
// `functionName_` inside the resulting module.  Returns `this` for chaining.
// NOTE(review): despite the parameter name, `cache` is passed straight to
// cuModuleLoadData, which takes the module image itself, not a file path --
// confirm against the callers.
kernel_t<CUDA>* kernel_t<CUDA>::loadFromLibrary(const char *cache, const std::string &functionName_){
  // OCCA macro; presumably brings the CUDA-specific data_ handle into scope.
  OCCA_EXTRACT_DATA(CUDA, Kernel);

  functionName = functionName_;

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                  cuModuleLoadData(&data_.module, cache));

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                  cuModuleGetFunction(&data_.function, data_.module, functionName.c_str()));

  return this;
}
/*
 * Ensure the PTX module for this Halide-compiled module is loaded.
 * Creates the module_state list node on first call (halide_release later
 * walks state_list to unload modules).  Returns 0 on success, or a CUDA
 * error code / context error otherwise.
 */
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* ptx_src, int size) {
    DEBUG_PRINTF( user_context, "CUDA: halide_init_kernels (user_context: %p, state_ptr: %p, ptx_src: %p, %i)\n",
                  user_context, state_ptr, ptx_src, size );

    CudaContext ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_init_kernels/halide_release is called.
    // halide_release traverses this list and releases the module objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state**)state_ptr;
    if (!(*state)) {
        *state = (module_state*)malloc(sizeof(module_state));
        (*state)->module = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the module itself if necessary.
    if (!(*state)->module) {
        DEBUG_PRINTF( user_context, " cuModuleLoadData %p, %i -> ", ptx_src, size );
        CUresult err = cuModuleLoadData(&(*state)->module, ptx_src);
        if (err != CUDA_SUCCESS) {
            DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
            halide_error_varargs(user_context,
                                 "CUDA: cuModuleLoadData failed (%s)",
                                 _get_error_name(err));
            return err;
        } else {
            // BUG FIX: the original declared a separate local `CUmodule module`
            // that was never assigned and printed it here; print the module
            // that cuModuleLoadData actually filled in.
            DEBUG_PRINTF( user_context, "%p\n", (*state)->module );
        }
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
int main(){ init_test(); const std::string test_source = ".version 4.2\n" ".target sm_20\n" ".address_size 64\n" ".visible .entry _Z6kernelPfi(\n" ".param .u64 _Z6kernelPfi_param_0,\n" ".param .u32 _Z6kernelPfi_param_1){\n" ".reg .pred %p<2>;\n" ".reg .f32 %f<3>;\n" ".reg .s32 %r<3>;\n" ".reg .s64 %rd<5>;\n" "ld.param.u64 %rd1, [_Z6kernelPfi_param_0];\n" "ld.param.u32 %r2, [_Z6kernelPfi_param_1];\n" "mov.u32 %r1, %tid.x;\n" "setp.ge.u32 %p1, %r1, %r2;\n" "@%p1 bra BB0_2;\n" "cvta.to.global.u64 %rd2, %rd1;\n" "cvt.rn.f32.u32 %f1, %r1;\n" "mul.f32 %f2, %f1, 0f3FC00000;\n" "mul.wide.u32 %rd3, %r1, 4;\n" "add.s64 %rd4, %rd2, %rd3;\n" "st.global.f32 [%rd4], %f2;\n" "BB0_2:\n" "ret;\n" "}"; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, test_source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, "_Z6kernelPfi")); CUdeviceptr devArray; int size = 10; float hostArray[size]; memset(hostArray, 0, size * sizeof(hostArray[0])); cu_assert(cuMemAlloc(&devArray, sizeof(float) * size)); void * params[] = {&devArray, &size}; auto result = cuLaunchKernel(funcHandle, 1,1,1, size*2,1,1, 0,0, params, nullptr); cu_assert(result); cu_assert(cuMemcpyDtoH(&hostArray, devArray, sizeof(hostArray[0])*size)); cu_assert(cuMemFree(devArray)); cu_assert(cuModuleUnload(modId)); for (int i=0 ; i<size ; ++i) std::cout << hostArray[i] << '\n'; return 0; }
// Binds this interface to a CUDA device: creates the context, selects the
// kernel blob matching the padded state count / precision, and loads it as
// the CUDA module.  Exits the process if no matching kernel code exists.
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount, long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    // resourceMap translates the caller-visible device number to a CUDA ordinal.
    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    // Dynamic scaling needs host-mapped memory, hence CU_CTX_MAP_HOST.
    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cudaDevice));
    } else {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice));
    }

    if (kernelMap == NULL) {
        // kernels have not yet been initialized; do so now. Hopefully, this only occurs once per library load.
        InitializeKernelMap();
    }

    // Kernel lookup key: padded state count, negated for double precision.
    int id = paddedStateCount;
    if (flags & BEAGLE_FLAG_PRECISION_DOUBLE) {
        id *= -1;
    }

    if (kernelMap->count(id) == 0) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }

    // Take a private copy of the resource and stamp it with this run's sizes.
    kernelResource = (*kernelMap)[id].copy();
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    // Pop the new context off this thread; presumably re-pushed before each
    // later driver call -- confirm against the rest of the class.
    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving GPUInterface::SetDevice\n");
#endif
}
int main(){ init_test(); const std::string source = ".version 4.2\n" ".target sm_20\n" ".address_size 64\n" ".visible .entry kernel_4(\n" ".param .u32 kernel_4_param_0,\n" ".param .u64 kernel_4_param_1\n" ")\n" "{\n" ".reg .s32 %r<3>;\n" ".reg .s64 %rd<3>;\n" "ld.param.u32 %r1, [kernel_4_param_0];\n" "ld.param.u64 %rd1, [kernel_4_param_1];\n" "cvta.to.global.u64 %rd2, %rd1;\n" "add.s32 %r2, %r1, 7;\n" "st.global.u32 [%rd2], %r2;\n" "ret;\n" "}"; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel_4")); CUdeviceptr devValue; int hostValue = 10; cu_assert(cuMemAlloc(&devValue, sizeof(int))); void * params[] = {&hostValue, &devValue}; cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr)); int result = 0; cu_assert(cuMemcpyDtoH(&result, devValue, sizeof(result))); assert(result == hostValue + 7); std::cout << result << "\n"; cu_assert(cuMemFree(devValue)); cu_assert(cuModuleUnload(modId)); return 0; }
// One-time CUDA setup for a Halide-compiled module: lazily creates the
// process-wide context (shared via cuda_ctx_ptr), loads this module's PTX
// into __mod, and creates the __start/__end timing events.
WEAK void halide_init_kernels(const char* ptx_src) {
    // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context.
    if (cuda_ctx_ptr == NULL) {
        cuda_ctx_ptr = &weak_cuda_ctx;
    }

    // Initialize one shared context for all Halide compiled instances
    if (*cuda_ctx_ptr == 0) {
        // Initialize CUDA
        CHECK_CALL( cuInit(0), "cuInit" );

        // Make sure we have a device
        int deviceCount = 0;
        CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" );
        assert(deviceCount > 0);

        // HL_GPU_DEVICE overrides automatic device selection.
        char *device_str = getenv("HL_GPU_DEVICE");

        CUdevice dev;
        // Get device
        CUresult status;
        if (device_str) {
            status = cuDeviceGet(&dev, atoi(device_str));
        } else {
            // NOTE(review): ids 2..0 are probed regardless of deviceCount;
            // cuDeviceGet on a nonexistent id is relied on to fail cleanly.
            for (int id = 2; id >= 0; id--) {
                // Try to get a device >0 first, since 0 should be our display device
                status = cuDeviceGet(&dev, id);
                if (status == CUDA_SUCCESS) break;
            }
        }
        if (status != CUDA_SUCCESS) {
            fprintf(stderr, "Failed to get device\n");
            exit(-1);
        }

        #ifndef NDEBUG
        fprintf(stderr, "Got device %d, about to create context (t=%d)\n", dev, halide_current_time());
        #endif

        // Create context
        CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" );
    } else {
        //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" );
    }

    // Initialize a module for just this Halide module
    if (!__mod) {
        // Create module
        CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" );

        #ifndef NDEBUG
        fprintf(stderr, "-------\nCompiling PTX:\n%s\n--------\n", ptx_src);
        #endif
    }

    // Create two events for timing
    // NOTE(review): these cuEventCreate results are not error-checked.
    if (!__start) {
        cuEventCreate(&__start, 0);
        cuEventCreate(&__end, 0);
    }
}
/*
 * Output-link configuration for the CUDA scale filter: loads the embedded
 * PTX (vf_scale_cuda_ptx), resolves one bilinear-subsample kernel and one
 * texture reference per pixel format, configures the texture refs, then
 * evaluates the output dimensions and sets up the processing chain.
 * Returns 0 on success or a negative error code.
 */
static av_cold int cudascale_config_props(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    AVFilterLink *inlink = outlink->src->inputs[0];
    CUDAScaleContext *s = ctx->priv;
    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    int w, h;
    int ret;

    /* PTX blob compiled into the binary. */
    extern char vf_scale_cuda_ptx[];

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0)
        goto fail;

    ret = CHECK_CU(cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx));
    if (ret < 0)
        goto fail;

    /* NOTE(review): the CHECK_CU results below are logged by the macro but
       not propagated; a missing kernel/texref would not fail configuration
       here -- confirm this is intended. */
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"));

    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex"));

    /* Read integer texels as integers (not normalized floats)... */
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER));

    /* ...and sample with bilinear filtering. */
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR));

    CHECK_CU(cuCtxPopCurrent(&dummy));

    /* Evaluate w_expr/h_expr against the input link to get output size. */
    if ((ret = ff_scale_eval_dimensions(s, s->w_expr, s->h_expr, inlink, outlink, &w, &h)) < 0)
        goto fail;

    /* NOTE(review): overflow here is only logged, not treated as an error. */
    if (((int64_t)h * inlink->w) > INT_MAX || ((int64_t)w * inlink->h) > INT_MAX)
        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");

    outlink->w = w;
    outlink->h = h;

    ret = init_processing_chain(ctx, inlink->w, inlink->h, w, h);
    if (ret < 0)
        return ret;

    av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d -> w:%d h:%d\n",
           inlink->w, inlink->h, outlink->w, outlink->h);

    /* Preserve display aspect ratio across the rescale. */
    if (inlink->sample_aspect_ratio.num) {
        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w, outlink->w*inlink->h},
                                                inlink->sample_aspect_ratio);
    } else {
        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
    }

    return 0;

fail:
    return ret;
}
//------------------------------------------------------------------------------ void build(CUmodule& module, CUfunction& kernel, const std::vector< std::string >& files, const char* kernel_name) { CUjit_option options[] = {CU_JIT_WALL_TIME, CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_LOG_VERBOSE}; float walltime = 0.f; const unsigned bufsize = 0x10000; char error_buf[bufsize] = ""; char log_buf[bufsize] = ""; const int verbose = 1; void* option_values[] = {(void*) &walltime, (void*) log_buf, (void*) bufsize, (void*) error_buf, (void*) bufsize, (void*) verbose}; void* compiled_code = 0; size_t compiled_size = 0; int status = CUDA_SUCCESS - 1; CUlinkState link_state = CUlinkState(); const int num_options = sizeof(options) / sizeof(CUjit_option); // Create a pending linker invocation CCHECK(cuLinkCreate(num_options, options, option_values, &link_state)); for(std::vector< std::string >::const_iterator i = files.begin(); i != files.end(); ++i) { status = cuLinkAddFile(link_state, CU_JIT_INPUT_PTX, i->c_str(), 0, //num options 0, //options, 0); //option values } if( status != CUDA_SUCCESS ) { std::cerr << "PTX Linker Error:\n"<< error_buf << std::endl; exit(EXIT_FAILURE); } // Complete the linker step: compiled_code is filled with executable code //???: what do I do with the returned data ? can/should I delete it ? CCHECK(cuLinkComplete(link_state, &compiled_code, &compiled_size)); assert(compiled_size > 0); assert(compiled_code); std::cout << "CUDA Link Completed in " << walltime << " ms\n" << log_buf << std::endl; CCHECK(cuModuleLoadData(&module, compiled_code)); CCHECK(cuModuleGetFunction(&kernel, module, kernel_name)); CCHECK(cuLinkDestroy(link_state)); }
/* JIT-link the given PTX objects into a freshly loaded CUDA module.
 * module   - out: module loaded from the linked image
 * ptx_objs - array of PTX blobs (code + size) to link, in order
 * num_objs - number of entries in ptx_objs
 * Any driver failure is fatal via GOMP_PLUGIN_fatal, after dumping the
 * JIT error log collected through CU_JIT_ERROR_LOG_BUFFER. */
static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  /* Integer-valued options are passed by value through the void* slot. */
  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
			 ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    /* BUG FIX: error message previously misspelled "cuLinkDestory".  */
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}
static CUmodule build_kernel_source(const char *source_file, long target_capability) { char *source; int link_dev_runtime; nvrtcProgram program; nvrtcResult rc; char arch_buf[128]; const char *options[10]; int opt_index = 0; int build_failure = 0; char *build_log; size_t build_log_len; char *ptx_image; size_t ptx_image_len; void *bin_image; size_t bin_image_len; CUmodule cuda_module; CUresult cuda_rc; source = load_kernel_source(source_file, &link_dev_runtime); rc = nvrtcCreateProgram(&program, source, NULL, 0, NULL, NULL); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcCreateProgram"); /* * Put command line options as cuda_program.c doing */ options[opt_index++] = "-I " CUDA_INCLUDE_PATH; snprintf(arch_buf, sizeof(arch_buf), "--gpu-architecture=compute_%ld", target_capability); options[opt_index++] = arch_buf; #ifdef PGSTROM_DEBUG options[opt_index++] = "--device-debug"; options[opt_index++] = "--generate-line-info"; #endif options[opt_index++] = "--use_fast_math"; if (link_dev_runtime) options[opt_index++] = "--relocatable-device-code=true"; /* * Kick runtime compiler */ rc = nvrtcCompileProgram(program, opt_index, options); if (rc != NVRTC_SUCCESS) { if (rc == NVRTC_ERROR_COMPILATION) build_failure = 1; else nvrtc_error(rc, "nvrtcCompileProgram"); } /* * Print build log */ rc = nvrtcGetProgramLogSize(program, &build_log_len); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetProgramLogSize"); build_log = malloc(build_log_len + 1); if (!build_log) { fputs("out of memory", stderr); exit(1); } rc = nvrtcGetProgramLog(program, build_log); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetProgramLog"); if (build_log_len > 1) printf("build log:\n%s\n", build_log); if (build_failure) exit(1); /* * Get PTX Image */ rc = nvrtcGetPTXSize(program, &ptx_image_len); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetPTXSize"); ptx_image = malloc(ptx_image_len + 1); if (!ptx_image) { fputs("out of memory", stderr); exit(1); } rc = nvrtcGetPTX(program, ptx_image); if (rc != 
NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetPTX"); ptx_image[ptx_image_len] = '\0'; /* * Link device runtime if needed */ if (link_dev_runtime) { link_device_libraries(ptx_image, ptx_image_len, &bin_image, &bin_image_len, target_capability); } else { bin_image = ptx_image; bin_image_len = ptx_image_len; } cuda_rc = cuModuleLoadData(&cuda_module, bin_image); if (cuda_rc != CUDA_SUCCESS) cuda_error(rc, "cuModuleLoadData"); return cuda_module; }
// One-time CUDA setup for a Halide-compiled module: lazily creates the
// process-wide context (shared via cuda_ctx_ptr), loads this module's PTX
// into __mod, and creates the __start/__end timing events.
// NOTE(review): `size` is received but never used in this function.
WEAK void halide_init_kernels(void *user_context, const char* ptx_src, int size) {
    // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context.
    if (cuda_ctx_ptr == NULL) {
        cuda_ctx_ptr = &weak_cuda_ctx;
    }

    // Initialize one shared context for all Halide compiled instances
    if (*cuda_ctx_ptr == 0) {
        // Initialize CUDA
        CHECK_CALL( cuInit(0), "cuInit" );

        // Make sure we have a device
        int deviceCount = 0;
        CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" );
        halide_assert(user_context, deviceCount > 0);

        // HL_GPU_DEVICE overrides automatic device selection.
        char *device_str = getenv("HL_GPU_DEVICE");

        CUdevice dev;
        // Get device
        CUresult status;
        if (device_str) {
            status = cuDeviceGet(&dev, atoi(device_str));
        } else {
            // Try to get a device >0 first, since 0 should be our display device
            // For now, don't try devices > 2 to maintain compatibility with previous behavior.
            if (deviceCount > 2) deviceCount = 2;
            for (int id = deviceCount - 1; id >= 0; id--) {
                status = cuDeviceGet(&dev, id);
                if (status == CUDA_SUCCESS) break;
            }
        }
        halide_assert(user_context, status == CUDA_SUCCESS && "Failed to get device\n");

        #ifdef DEBUG
        halide_printf(user_context, "Got device %d, about to create context (t=%lld)\n", dev, (long long)halide_current_time_ns(user_context));
        #endif

        // Create context
        CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" );
    } else {
        //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" );
    }

    // Initialize a module for just this Halide module
    if (!__mod) {
        // Create module
        CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" );

        #ifdef DEBUG
        halide_printf(user_context, "-------\nCompiling PTX:\n%s\n--------\n", ptx_src);
        #endif
    }

    // Create two events for timing
    // NOTE(review): these cuEventCreate results are not error-checked.
    if (!__start) {
        cuEventCreate(&__start, 0);
        cuEventCreate(&__end, 0);
    }
}
// Binds this interface to a CUDA device: creates the context, selects the
// kernel blob for the padded state count / precision, loads it as the CUDA
// module, and allocates either a single default stream or a pool of streams
// (with one extra event) depending on problem size and flags.  Exits the
// process on unrecoverable errors.
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount, int unpaddedPatternCount, int tipCount, long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    // resourceMap translates the caller-visible device number to a CUDA ordinal.
    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    // Dynamic scaling needs host-mapped memory, hence CU_CTX_MAP_HOST.
    unsigned int ctxFlags = CU_CTX_SCHED_AUTO;
    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        ctxFlags |= CU_CTX_MAP_HOST;
    }

    // Context creation gets a bespoke error path so a compute-exclusive
    // device can be diagnosed explicitly.
    CUresult error = cuCtxCreate(&cudaContext, ctxFlags, cudaDevice);
    if(error != CUDA_SUCCESS) {
        fprintf(stderr, "CUDA error: \"%s\" (%d) from file <%s>, line %i.\n", GetCUDAErrorDescription(error), error, __FILE__, __LINE__);
        if (error == CUDA_ERROR_INVALID_DEVICE) {
            fprintf(stderr, "(The requested CUDA device is likely set to compute exclusive mode. This mode prevents multiple processes from running on the device.)");
        }
        exit(-1);
    }

    InitializeKernelResource(paddedStateCount, flags & BEAGLE_FLAG_PRECISION_DOUBLE);

    if (!kernelResource) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }

    // Stamp the selected resource with this run's sizes.
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->unpaddedPatternCount = unpaddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    // Small problems (or explicit grid-parallel mode) use a single default
    // stream; otherwise create up to BEAGLE_STREAM_COUNT streams sized from
    // the tip count, plus one trailing event.
    if ((paddedPatternCount < BEAGLE_MULTI_GRID_MAX || flags & BEAGLE_FLAG_PARALLELOPS_GRID) && !(flags & BEAGLE_FLAG_PARALLELOPS_STREAMS)) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL; // NULL stream = the default stream
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = tipCount/2 + 1;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        // One extra event beyond the per-stream set.
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    // Pop the new context off this thread; presumably re-pushed before each
    // later driver call -- confirm against the rest of the class.
    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving GPUInterface::SetDevice\n");
#endif
}
/*
 * Build a gpukernel from source strings, a PTX blob, or a prebuilt binary.
 *
 * c        - cuda_context (opaque void* from the generic layer)
 * count    - number of entries in strings/lengths
 * strings  - source fragments (or one binary blob for GA_USE_BINARY)
 * lengths  - per-fragment lengths; NULL or a 0 entry means NUL-terminated
 * fname    - kernel entry-point name to look up in the loaded module
 * argcount - number of kernel arguments; types has argcount entries
 * types    - argument type codes, copied into the kernel object
 * flags    - GA_USE_* bitmask selecting input kind and required features
 * ret      - optional out-slot for an error code (via the FAIL macro)
 * err_str  - optional out-slot for a malloc'd build log; caller frees it
 *
 * Returns the new kernel, or NULL on failure (error code via ret/FAIL).
 * Compiled binaries are memoized in the process-wide compile_cache keyed
 * by (source, arch); cache insertion failures are non-fatal.
 */
static gpukernel *cuda_newkernel(void *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, int *ret, char **err_str) {
  cuda_context *ctx = (cuda_context *)c;
  strb sb = STRB_STATIC_INIT;
  char *bin, *log = NULL;
  srckey k, *ak;
  binval *av;
  gpukernel *res;
  size_t bin_len = 0, log_len = 0;
  CUdevice dev;
  unsigned int i;
  int ptx_mode = 0;
  int binary_mode = 0;
  int major, minor;

  if (count == 0) FAIL(NULL, GA_VALUE_ERROR);

  if (flags & GA_USE_OPENCL)
    FAIL(NULL, GA_DEVSUP_ERROR);

  if (flags & GA_USE_BINARY) {
    // GA_USE_BINARY is exclusive
    if (flags & ~GA_USE_BINARY)
      FAIL(NULL, GA_INVALID_ERROR);
    // We need the length for binary data and there is only one blob.
    if (count != 1 || lengths == NULL || lengths[0] == 0)
      FAIL(NULL, GA_VALUE_ERROR);
  }

  cuda_enter(ctx);

  ctx->err = cuCtxGetDevice(&dev);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  // Compute capability gates the feature checks below.
  ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  // GA_USE_CLUDA is done later
  // GA_USE_SMALL will always work
  if (flags & GA_USE_DOUBLE) {
    // Double precision requires compute capability >= 1.3.
    if (major < 1 || (major == 1 && minor < 3)) {
      cuda_exit(ctx);
      FAIL(NULL, GA_DEVSUP_ERROR);
    }
  }
  if (flags & GA_USE_COMPLEX) {
    // just for now since it is most likely broken
    cuda_exit(ctx);
    FAIL(NULL, GA_DEVSUP_ERROR);
  }
  // GA_USE_HALF should always work

  if (flags & GA_USE_PTX) {
    ptx_mode = 1;
  } else if (flags & GA_USE_BINARY) {
    binary_mode = 1;
  }

  if (binary_mode) {
    // Caller handed us a ready-to-load module image; just copy it.
    bin = memdup(strings[0], lengths[0]);
    bin_len = lengths[0];
    if (bin == NULL) {
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }
  } else {
    // Concatenate all source fragments into one NUL-terminated buffer.
    if (flags & GA_USE_CLUDA) {
      strb_appends(&sb, CUDA_PREAMBLE);
    }
    if (lengths == NULL) {
      for (i = 0; i < count; i++)
        strb_appends(&sb, strings[i]);
    } else {
      for (i = 0; i < count; i++) {
        // A 0 length entry means the fragment is NUL-terminated.
        if (lengths[i] == 0)
          strb_appends(&sb, strings[i]);
        else
          strb_appendn(&sb, strings[i], lengths[i]);
      }
    }
    strb_append0(&sb);

    if (strb_error(&sb)) {
      strb_clear(&sb);
      cuda_exit(ctx);
      return NULL;
    }

    if (ptx_mode) {
      // PTX is loadable as-is: hand ownership of sb's buffer to bin.
      // NOTE(review): sb is not strb_clear()ed on this path — bin aliases
      // sb.s and is freed later through the kernel object.
      bin = sb.s;
      bin_len = sb.l;
    } else {
      // CUDA C path: check the compile cache, else invoke the compiler.
      bin = NULL;
      if (compile_cache != NULL) {
        k.src = sb.s;
        k.len = sb.l;
        memcpy(k.arch, ctx->bin_id, BIN_ID_LEN);
        av = cache_get(compile_cache, &k);
        if (av != NULL) {
          bin = memdup(av->bin, av->len);
          bin_len = av->len;
        }
      }
      if (bin == NULL) {
        bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, &log, &log_len, ret);
      }
      if (bin == NULL) {
        // Compilation failed: optionally build a numbered-source report.
        if (err_str != NULL) {
          strb debug_msg = STRB_STATIC_INIT;

          // We're substituting debug_msg for a string with this first line:
          strb_appends(&debug_msg, "CUDA kernel build failure ::\n");

          /* Delete the final NUL */
          sb.l--;
          gpukernel_source_with_line_numbers(1, (const char **)&sb.s, &sb.l, &debug_msg);

          if (log != NULL) {
            strb_appends(&debug_msg, "\nCompiler log:\n");
            strb_appendn(&debug_msg, log, log_len);
            free(log);
          }
          *err_str = strb_cstr(&debug_msg);
          // *err_str will be free()d by the caller (see docs in kernel.h)
        }
        strb_clear(&sb);
        cuda_exit(ctx);
        return NULL;
      }
      // Lazily create the cache; any failure below only skips memoization.
      if (compile_cache == NULL)
        compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free, bin_free);

      if (compile_cache != NULL) {
        ak = malloc(sizeof(*ak));
        av = malloc(sizeof(*av));
        if (ak == NULL || av == NULL) {
          free(ak);
          free(av);
          goto done;
        }
        ak->src = memdup(sb.s, sb.l);
        if (ak->src == NULL) {
          free(ak);
          free(av);
          goto done;
        }
        ak->len = sb.l;
        memmove(ak->arch, ctx->bin_id, BIN_ID_LEN);
        av->len = bin_len;
        av->bin = memdup(bin, bin_len);
        if (av->bin == NULL) {
          src_free(ak);
          free(av);
          goto done;
        }
        // Cache takes ownership of ak/av.
        cache_add(compile_cache, ak, av);
      }
    done:
      strb_clear(&sb);
    }
  }

  // Assemble the kernel object; _cuda_freekernel handles partial teardown.
  res = calloc(1, sizeof(*res));
  if (res == NULL) {
    free(bin);
    cuda_exit(ctx);
    FAIL(NULL, GA_SYS_ERROR);
  }

  res->bin_sz = bin_len;
  res->bin = bin;   // res now owns the binary image.
  res->refcnt = 1;
  res->argcount = argcount;
  res->types = calloc(argcount, sizeof(int));
  if (res->types == NULL) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_MEMORY_ERROR);
  }
  memcpy(res->types, types, argcount*sizeof(int));
  res->args = calloc(argcount, sizeof(void *));
  if (res->args == NULL) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_MEMORY_ERROR);
  }

  // Load the module and resolve the requested entry point.
  ctx->err = cuModuleLoadData(&res->m, bin);
  if (ctx->err != CUDA_SUCCESS) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
  if (ctx->err != CUDA_SUCCESS) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  // Kernel keeps the context alive until it is freed.
  res->ctx = ctx;
  ctx->refcnt++;
  cuda_exit(ctx);
  TAG_KER(res);
  return res;
}
// Shim: implement cuModuleLoadDataEx in terms of plain cuModuleLoadData.
// The JIT option triplet (numOptions/options/optionValues) is intentionally
// discarded — the image is loaded with default settings only.
CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)
{
    // Silence unused-parameter warnings; no option processing is performed.
    (void)numOptions;
    (void)options;
    (void)optionValues;

    return cuModuleLoadData(module, image);
}
// load/read kernel from 'program' file/string, compile and return the requested function CUresult ptxJIT(const char* program, const char* functionName, CUmodule *phModule, CUfunction *phKernel, CUlinkState *lState, bool bFromFile) { CUjit_option options[6]; void *optionVals[6]; float walltime(0); const unsigned logSize(8192); char error_log[logSize], info_log[logSize]; void *cuOut; size_t outSize; // Setup linker options // Return walltime from JIT compilation options[0] = CU_JIT_WALL_TIME; optionVals[0] = (void*)&walltime; // Pass a buffer for info messages options[1] = CU_JIT_INFO_LOG_BUFFER; optionVals[1] = (void*)info_log; // Pass the size of the info buffer options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; optionVals[2] = (void*)(long)logSize; // Pass a buffer for error message options[3] = CU_JIT_ERROR_LOG_BUFFER; optionVals[3] = (void*)error_log; // Pass the size of the error buffer options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; optionVals[4] = (void*)(long)logSize; // Make the linker verbose options[5] = CU_JIT_LOG_VERBOSE; optionVals[5] = (void*)1; // Create a pending linker invocation checkCudaErrors(cuLinkCreate(6, options, optionVals, lState)); DEBUG("Loading '%s' program", functionName); CUresult myErr; if (bFromFile) { // Load the PTX from the file (64-bit) myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, program, 0, 0, 0); } else { // Load the PTX from the string myPtx (64-bit) myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)program, strlen(program)+1, 0, 0, 0, 0); } if (myErr != CUDA_SUCCESS) { // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. VERBOSE("PTX Linker Error: %s", error_log); return myErr; } // Complete the linker step checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize)); // Linker walltime and info_log were requested in options above. 
DEBUG("CUDA link completed (%gms):\n%s", walltime, info_log); // Load resulting cuBin into module checkCudaErrors(cuModuleLoadData(phModule, cuOut)); // Locate the kernel entry point checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, functionName)); // Destroy the linker invocation checkCudaErrors(cuLinkDestroy(*lState)); return CUDA_SUCCESS; }
// Load a fixed PTX module containing four shift-right kernels
// (signed/unsigned, 32/64-bit), run the mangled kernel 'name' on
// (input >> shiftValue), and return the value the kernel stored.
//
// name       - mangled PTX entry name (e.g. "_Z10kernel_s32Piii");
//              presumably chosen so T matches the kernel's operand width
//              — TODO confirm at call sites.
// input      - value to shift (passed by value as a kernel parameter)
// shiftValue - shift amount
//
// Any driver-API failure trips cu_assert. Module and device memory are
// released before returning.
T run_function(const std::string& name, const T input, const int shiftValue)
{
    // Compiler-generated PTX (nvcc 7.5, sm_20) kept verbatim — the mangled
    // entry names below are part of this function's contract with callers.
    const std::string test_source =
    "//\n"
    "// Generated by NVIDIA NVVM Compiler\n"
    "//\n"
    "// Compiler Build ID: CL-19856038\n"
    "// Cuda compilation tools, release 7.5, V7.5.17\n"
    "// Based on LLVM 3.4svn\n"
    "//\n"
    "\n"
    ".version 4.3\n"
    ".target sm_20\n"
    ".address_size 64\n"
    "\n"
    " // .globl _Z10kernel_s32Piii\n"
    "\n"
    ".visible .entry _Z10kernel_s32Piii(\n"
    " .param .u64 _Z10kernel_s32Piii_param_0,\n"
    " .param .u32 _Z10kernel_s32Piii_param_1,\n"
    " .param .u32 _Z10kernel_s32Piii_param_2\n"
    ")\n"
    "{\n"
    " .reg .b32 %r<4>;\n"
    " .reg .b64 %rd<3>;\n"
    "\n"
    "\n"
    " ld.param.u64 %rd1, [_Z10kernel_s32Piii_param_0];\n"
    " ld.param.u32 %r1, [_Z10kernel_s32Piii_param_1];\n"
    " ld.param.u32 %r2, [_Z10kernel_s32Piii_param_2];\n"
    " cvta.to.global.u64 %rd2, %rd1;\n"
    " shr.s32 %r3, %r1, %r2;\n"
    " st.global.u32 [%rd2], %r3;\n"
    " ret;\n"
    "}\n"
    "\n"
    " // .globl _Z10kernel_s64Pxxi\n"
    ".visible .entry _Z10kernel_s64Pxxi(\n"
    " .param .u64 _Z10kernel_s64Pxxi_param_0,\n"
    " .param .u64 _Z10kernel_s64Pxxi_param_1,\n"
    " .param .u32 _Z10kernel_s64Pxxi_param_2\n"
    ")\n"
    "{\n"
    " .reg .b32 %r<2>;\n"
    " .reg .b64 %rd<5>;\n"
    "\n"
    "\n"
    " ld.param.u64 %rd1, [_Z10kernel_s64Pxxi_param_0];\n"
    " ld.param.u64 %rd2, [_Z10kernel_s64Pxxi_param_1];\n"
    " ld.param.u32 %r1, [_Z10kernel_s64Pxxi_param_2];\n"
    " cvta.to.global.u64 %rd3, %rd1;\n"
    " shr.s64 %rd4, %rd2, %r1;\n"
    " st.global.u64 [%rd3], %rd4;\n"
    " ret;\n"
    "}\n"
    "\n"
    " // .globl _Z10kernel_u32Pjji\n"
    ".visible .entry _Z10kernel_u32Pjji(\n"
    " .param .u64 _Z10kernel_u32Pjji_param_0,\n"
    " .param .u32 _Z10kernel_u32Pjji_param_1,\n"
    " .param .u32 _Z10kernel_u32Pjji_param_2\n"
    ")\n"
    "{\n"
    " .reg .b32 %r<4>;\n"
    " .reg .b64 %rd<3>;\n"
    "\n"
    "\n"
    " ld.param.u64 %rd1, [_Z10kernel_u32Pjji_param_0];\n"
    " ld.param.u32 %r1, [_Z10kernel_u32Pjji_param_1];\n"
    " ld.param.u32 %r2, [_Z10kernel_u32Pjji_param_2];\n"
    " cvta.to.global.u64 %rd2, %rd1;\n"
    " shr.u32 %r3, %r1, %r2;\n"
    " st.global.u32 [%rd2], %r3;\n"
    " ret;\n"
    "}\n"
    "\n"
    " // .globl _Z10kernel_u64Pyyi\n"
    ".visible .entry _Z10kernel_u64Pyyi(\n"
    " .param .u64 _Z10kernel_u64Pyyi_param_0,\n"
    " .param .u64 _Z10kernel_u64Pyyi_param_1,\n"
    " .param .u32 _Z10kernel_u64Pyyi_param_2\n"
    ")\n"
    "{\n"
    " .reg .b32 %r<2>;\n"
    " .reg .b64 %rd<5>;\n"
    "\n"
    "\n"
    " ld.param.u64 %rd1, [_Z10kernel_u64Pyyi_param_0];\n"
    " ld.param.u64 %rd2, [_Z10kernel_u64Pyyi_param_1];\n"
    " ld.param.u32 %r1, [_Z10kernel_u64Pyyi_param_2];\n"
    " cvta.to.global.u64 %rd3, %rd1;\n"
    " shr.u64 %rd4, %rd2, %r1;\n"
    " st.global.u64 [%rd3], %rd4;\n"
    " ret;\n"
    "}\n"
    "\n"
    "\n"
    ;

    // Load the PTX and resolve the requested entry point.
    CUmodule modId = 0;
    CUfunction funcHandle = 0;
    cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
    cu_assert(cuModuleGetFunction(&funcHandle, modId, name.c_str()));

    // One device word for the kernel's single output.
    T output;
    CUdeviceptr devOutput;
    cu_assert(cuMemAlloc(&devOutput, sizeof(output)));

    // Kernel signature: (out_ptr, value, shift) — order must match the
    // .param lists above.
    void * params[] = {&devOutput, (void*)&input, (void*)&shiftValue};
    // Single thread is enough: the kernels write exactly one element.
    auto result = cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr);
    cu_assert(result);

    // cuMemcpyDtoH is blocking, so the launch has completed by the time
    // 'output' is read.
    cu_assert(cuMemcpyDtoH(&output, devOutput, sizeof(output)));
    cu_assert(cuMemFree(devOutput));
    cu_assert(cuModuleUnload(modId));
    return output;
}