/*
 * OCaml stub: JIT-compiles the PTX text held in `moduleSrc` with
 * cuModuleLoadDataEx and looks up the kernel named `function_name` in the
 * resulting module.
 *
 * moduleSrc      OCaml string containing the PTX source.
 * function_name  OCaml string naming the kernel to look up.
 * gi             not referenced directly in this body; it may be consumed by
 *                the BLOCKING_CUDA_*_CONTEXT macros -- TODO confirm.
 *
 * Returns the address of a heap-allocated CUfunction slot cast to `value`.
 * The slot is zero-initialised, so it holds a NULL handle whenever
 * compilation or lookup fails. Nothing in this function frees the slot.
 *
 * On failure the JIT log buffer is written to stderr.
 */
CAMLprim value spoc_cuda_debug_compile(value moduleSrc, value function_name, value gi){
	CAMLparam3(moduleSrc, function_name, gi);
	CUmodule module;
	CUfunction *kernel;
	char* functionN;
	char *ptx_source;
	const unsigned int jitNumOptions = 4;
	CUjit_option jitOptions[4];
	void *jitOptVals[4];
	int jitLogBufferSize;
	char *jitLogBuffer;
	int jitRegCount = 32;
	int module_loaded = 0;

	BLOCKING_CUDA_GET_CONTEXT;

	/* calloc (not malloc): a failed compile must not leave a garbage handle
	 * in the slot that is handed back to the caller. */
	kernel = calloc(1, sizeof(CUfunction));
	functionN = String_val(function_name);
	ptx_source = String_val(moduleSrc);

	// set up size of compilation log buffer
	jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	jitLogBufferSize = 1024;

	// set up pointer to the compilation log buffer
	jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
	/* Zero-filled so the buffer is always a valid (possibly empty) C string,
	 * even when the driver writes nothing into it before an error is
	 * reported. */
	jitLogBuffer = calloc((size_t)jitLogBufferSize, sizeof(char));
	if (jitLogBuffer == NULL) {
		/* No buffer available: advertise a zero-sized log. */
		jitLogBufferSize = 0;
	}
	jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
	jitOptVals[1] = jitLogBuffer;

	// set up pointer to set the Maximum # of registers for a particular kernel
	jitOptions[2] = CU_JIT_MAX_REGISTERS;
	jitOptVals[2] = (void *)(size_t)jitRegCount;

	// take the compilation target from the current context
	jitOptions[3] = CU_JIT_TARGET_FROM_CUCONTEXT;
	/* This option takes no value; give the slot a defined content anyway
	 * instead of passing an indeterminate pointer to the driver. */
	jitOptVals[3] = NULL;

	cuda_error = (cuModuleLoadDataEx(&module,
			ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals));
	if (cuda_error)
	{
		if (jitLogBuffer != NULL) {
			fprintf (stderr,"%s\n", jitLogBuffer);
			fflush (stderr);
		}
	}
	else
	{
		module_loaded = 1;
	}

	/* Only query the module when it was actually created: `module` is
	 * uninitialised after a failed load. */
	if (module_loaded && kernel != NULL)
	{
		cuda_error = (cuModuleGetFunction(kernel, module, functionN));
		if (cuda_error)
		{
			if (jitLogBuffer != NULL) {
				fprintf (stderr, "%s\n", jitLogBuffer);
				fflush (stderr);
			}
		}
	}

	BLOCKING_CUDA_RESTORE_CONTEXT;
	free(jitLogBuffer);
	CAMLreturn((value) kernel);
}
/**
 * Builds a compilation context: selects the device on the CUDA manager, loads
 * the module image with the supplied JIT options and resolves the requested
 * kernel from it.
 *
 * @param image        module image handed to cuModuleLoadDataEx
 * @param kernel_name  name of the kernel to resolve in the loaded module
 * @param device_id    device whose context is made current via the manager
 * @param cuda_mgr     opaque pointer that is cast to CudaMgr_Namespace::CudaMgr
 * @param num_options  number of entries in `options` / `option_vals`
 * @param options      JIT option keys
 * @param option_vals  JIT option values, parallel to `options`
 *
 * Driver results are passed through checkCudaErrors, and the loaded module
 * handle is verified with CHECK before the kernel lookup.
 */
GpuCompilationContext::GpuCompilationContext(const void* image,
                                             const std::string& kernel_name,
                                             const int device_id,
                                             const void* cuda_mgr,
                                             unsigned int num_options,
                                             CUjit_option* options,
                                             void** option_vals)
    : module_(nullptr)
    , kernel_(nullptr)
    , device_id_(device_id)
    , cuda_mgr_(cuda_mgr) {
  // The manager is stored type-erased; recover its real type before use.
  const auto* manager = static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_);
  manager->setContext(device_id_);

  // Load the image into module_.
  checkCudaErrors(cuModuleLoadDataEx(&module_, image, num_options, options, option_vals));
  CHECK(module_);

  // Resolve the kernel entry point inside the freshly loaded module.
  checkCudaErrors(cuModuleGetFunction(&kernel_, module_, kernel_name.c_str()));
}
/*
 * Returns the cubin (GPU machine code) built from the PTX ISA assembly:
 * sets up the JIT compilation options and performs the compilation.
 *
 * KernelPTXDump  PTX image passed to cuModuleLoadDataEx.
 * gpu            optional device wrapper; when non-null its context is pushed
 *                for the duration of the call and popped again afterwards.
 *                When null, the context that is already current is used and
 *                left untouched.
 *
 * The returned CUmodule is allocated with `new`; nothing in this function
 * releases it.
 */
CUmodule * CudaCompiler::compilePTX(uchar * KernelPTXDump, GPU * gpu=0)
{
    // consider the default context is in run if gpu==0
    // TODO : make it mandatory
    const bool pushedContext = (gpu != 0);
    if (pushedContext)
        CUDCHK( cuCtxPushCurrent(gpu->context) );

    //CUDCHK( cuCtxSynchronize());

    // in this branch we use compilation with parameters
    const unsigned int jitNumOptions = 3;
    const int jitRegCount = 32;

    // Fixed-size scratch lives on the stack, so there is nothing to delete.
    // The log buffer is zero-filled so it is a valid C string even if the
    // driver writes nothing into it.
    char jitLogBuffer[1024] = { 0 };
    const int jitLogBufferSize = sizeof(jitLogBuffer);

    CUjit_option jitOptions[3];
    void * jitOptVals[3];

    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; // size of compilation log buffer
    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;            // pointer to the compilation log buffer
    jitOptVals[1] = jitLogBuffer;

    jitOptions[2] = CU_JIT_MAX_REGISTERS;              // maximum # of registers for a kernel
    jitOptVals[2] = (void *)(size_t)jitRegCount;

    // compile with set parameters
    qDebug()<<"> Compiling PTX module";

    CUmodule * cuModule = new CUmodule();
    CUDCHK( cuModuleLoadDataEx( cuModule, KernelPTXDump, jitNumOptions, jitOptions, (void **)jitOptVals) );

    qDebug()<< "PTX JIT log: \n [" << jitLogBuffer <<"]" ;

    // Pop only what this function pushed: popping unconditionally would
    // remove the caller's own context from the stack when gpu == 0.
    if (pushedContext) {
        CUcontext poppedContext;
        CUDCHK( cuCtxPopCurrent(&poppedContext) );
    }

    return cuModule;
}
// Reads the whole file `filename` into memory and loads it as a module with
// cuModuleLoadDataEx, passing CU_JIT_MAX_REGISTERS = maxRegisters.
//
// filename      path of the module image to read
// ppModule      receives the new module (swapped in) on success
// maxRegisters  value supplied for the CU_JIT_MAX_REGISTERS option
//
// Returns CUDA_ERROR_FILE_NOT_FOUND when the file cannot be opened and
// CUDA_SUCCESS when the module was loaded. Load failures are handled by
// HANDLE_RESULT(), which presumably returns `result` on error -- TODO confirm
// against the macro definition.
CUresult CuContext::LoadModuleFilenameEx(const std::string& filename,
	ModulePtr* ppModule, uint maxRegisters) {

	std::ifstream file(filename.c_str());
	if(!file.good()) return CUDA_ERROR_FILE_NOT_FOUND;

	// Slurp the entire stream; the default-constructed iterator marks the end.
	std::string contents((std::istreambuf_iterator<char>(file)),
		std::istreambuf_iterator<char>());

	ModulePtr module(new CuModule);

	CUjit_option options[1] = { CU_JIT_MAX_REGISTERS };
	// JIT option values travel in pointer-sized slots: the integer itself is
	// stored in the void*. Casting a `uint` array to void** would make the
	// driver read past the 4-byte element on 64-bit builds.
	void* values[1] = { (void*)(size_t)maxRegisters };

	CUresult result = cuModuleLoadDataEx(&module->_module, contents.c_str(),
		1, options, values);
	HANDLE_RESULT();

	module->_context = this;
	ppModule->swap(module);
	return CUDA_SUCCESS;
}
/// main - Program entry point int main(int argc, char** argv) { if (argc != 3) { printf("Usage: %s dataCount blockSize\n", argv[0]); exit(1); } CUdevice device; CUmodule cudaModule; CUcontext context; CUfunction function; CUlinkState linker; int devCount; // CUDA initialization checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGetCount(&devCount)); checkCudaErrors(cuDeviceGet(&device, 0)); char name[128]; checkCudaErrors(cuDeviceGetName(name, 128, device)); std::cout << "Using CUDA Device [0]: " << name << "\n"; int devMajor, devMinor; checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); std::cout << "Device Compute Capability: " << devMajor << "." << devMinor << "\n"; if (devMajor < 2) { std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; return 1; } std::ifstream t("kernel.ptx"); if (!t.is_open()) { std::cerr << "kernel.ptx not found\n"; return 1; } std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>()); // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); // Create module for object checkCudaErrors(cuModuleLoadDataEx(&cudaModule, str.c_str(), 0, 0, 0)); // Get kernel function checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); // Device data CUdeviceptr devBufferA; CUdeviceptr devBufferB; CUdeviceptr devBufferC; CUdeviceptr devBufferSMid; // Size unsigned dataCount = atoi(argv[1]); checkCudaErrors(cuMemAlloc(&devBufferA, sizeof(float) * dataCount)); checkCudaErrors(cuMemAlloc(&devBufferB, sizeof(float) * dataCount)); checkCudaErrors(cuMemAlloc(&devBufferC, sizeof(float) * dataCount)); checkCudaErrors(cuMemAlloc(&devBufferSMid, sizeof(int) * dataCount)); float* hostA = new float[dataCount]; float* hostB = new float[dataCount]; float* hostC = new float[dataCount]; int* hostSMid = new int[dataCount]; // Populate input for (unsigned i = 0; i != dataCount; ++i) { hostA[i] = (float)i; hostB[i] = (float)(2 * i); hostC[i] = 2.0f; hostSMid[i] = 0; } checkCudaErrors( 
cuMemcpyHtoD(devBufferA, &hostA[0], sizeof(float) * dataCount)); checkCudaErrors( cuMemcpyHtoD(devBufferB, &hostB[0], sizeof(float) * dataCount)); unsigned blockSizeX = atoi(argv[2]); unsigned blockSizeY = 1; unsigned blockSizeZ = 1; unsigned gridSizeX = (dataCount + blockSizeX - 1) / blockSizeX; unsigned gridSizeY = 1; unsigned gridSizeZ = 1; // Kernel parameters void* KernelParams[] = {&devBufferA, &devBufferB, &devBufferC, &devBufferSMid}; std::cout << "Launching kernel\n"; // Kernel launch checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ, 0, NULL, KernelParams, NULL)); // Retrieve device data checkCudaErrors( cuMemcpyDtoH(&hostC[0], devBufferC, sizeof(float) * dataCount)); checkCudaErrors( cuMemcpyDtoH(&hostSMid[0], devBufferSMid, sizeof(int) * dataCount)); std::cout << "Results:\n"; std::cout << "SM " << hostSMid[0] << ":" << hostA[0] << " + " << hostB[0] << " = " << hostC[0] << "\n"; for (unsigned i = 1; i != dataCount; i++) { if (hostSMid[i] != hostSMid[i - 1]) std::cout << "SM " << hostSMid[i] << ":" << hostA[i] << " + " << hostB[i] << " = " << hostC[i] << "\n"; } // Clean up after ourselves delete[] hostA; delete[] hostB; delete[] hostC; delete[] hostSMid; // Clean-up checkCudaErrors(cuMemFree(devBufferA)); checkCudaErrors(cuMemFree(devBufferB)); checkCudaErrors(cuMemFree(devBufferC)); checkCudaErrors(cuMemFree(devBufferSMid)); checkCudaErrors(cuModuleUnload(cudaModule)); checkCudaErrors(cuCtxDestroy(context)); return 0; }
void RTC_VIT(unsigned int number, const char* GPU_kernel, HMMER_PROFILE *hmm, unsigned int *seq_1D, unsigned int *offset, unsigned int *seq_len, unsigned int *iLen, unsigned int sum, double *pVal, int warp, int maxreg, dim3 GRID, dim3 BLOCK) { /*********************************/ /* 0. Prepare for cuda drive API */ /*********************************/ CUdevice cuDevice; CUcontext context; CUmodule module; CUfunction kernel; checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGet(&cuDevice, 0)); checkCudaErrors(cuCtxCreate(&context, 0, cuDevice)); /*********************************************/ /* 1. Device Property: fixed based on Device */ /*********************************************/ /****************************************/ /* 2. Device Memory Allocation and copy */ /****************************************/ StopWatchInterface *timer; sdkCreateTimer(&timer); sdkStartTimer(&timer); /* Driver API pointers */ CUdeviceptr d_seq, d_offset, d_len, d_len_6r, mat_v, trans, score; /* Allocation */ checkCudaErrors(cuMemAlloc(&d_seq, sum * sizeof(unsigned int))); /* copy 1D database */ checkCudaErrors(cuMemAlloc(&d_offset, number * sizeof(unsigned int))); /* copy offset of each seq*/ checkCudaErrors(cuMemAlloc(&d_len, number * sizeof(unsigned int))); /* copy raw length of each seq */ checkCudaErrors(cuMemAlloc(&d_len_6r, number * sizeof(unsigned int))); /* copy padding length of each seq */ checkCudaErrors(cuMemAlloc(&mat_v, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__))); /* striped EMISSION score */ checkCudaErrors(cuMemAlloc(&trans, hmm->vitQ * TRANS_TYPE * sizeof(__32int__))); /* striped transition score */ checkCudaErrors(cuMemAlloc(&score, number * sizeof(double))); /* P-Value as output */ /* H to D copy */ checkCudaErrors(cuMemcpyHtoD(d_seq, seq_1D, sum * sizeof(unsigned int))); checkCudaErrors(cuMemcpyHtoD(d_offset, offset, number * sizeof(unsigned int))); checkCudaErrors(cuMemcpyHtoD(d_len, seq_len, number * sizeof(unsigned int))); 
checkCudaErrors(cuMemcpyHtoD(d_len_6r, iLen, number * sizeof(unsigned int))); checkCudaErrors(cuMemcpyHtoD(mat_v, hmm->vit_vec, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__))); checkCudaErrors(cuMemcpyHtoD(trans, hmm->trans_vec, hmm->vitQ * TRANS_TYPE * sizeof(__32int__))); sdkStopTimer(&timer); printf("Alloc & H to D Copy time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /********************************************************/ /* 3. Runtime compilation, Generate PTX and Load module */ /********************************************************/ sdkCreateTimer(&timer); sdkStartTimer(&timer); /* NVRTC create handle */ nvrtcProgram prog; NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, // prog GPU_kernel, // buffer NULL, // name: CUDA program name. name can be NULL; “default_program” is used when it is NULL. 0, // numHeaders (I put header file path with -I later) NULL, // headers' content NULL)); // include full name of headers /* 1. eliminate const through pointer */ char *a = NULL; const char *b = a; const char **opts = &b; /* 2. 
elminate const through reference */ //char a_value = 'c'; //char* aa = &a_value; //const char *&bb = aa; // no way with const //const char**&ref = aa; // no way /* Dynamic Options */ char **test_char = new char*[8]; test_char[0] = new char[__INCLUDE__.length() + strlen("simd_def.h") + 1]; // #include simd_def.h strcpy(test_char[0], get_option(__INCLUDE__, "simd_def.h").c_str()); test_char[1] = new char[__INCLUDE__.length() + strlen("simd_functions.h") + 1]; // #include simd_functions.h strcpy(test_char[1], get_option(__INCLUDE__, "simd_functions.h").c_str()); test_char[2] = new char[__RDC__.length() + __F__.length() + 1]; // -rdc=false strcpy(test_char[2], get_option(__RDC__, __F__).c_str()); test_char[3] = new char[__ARCH__.length() + __CC35__.length() + 1]; // -arch=compute_35 strcpy(test_char[3], get_option(__ARCH__, __CC35__).c_str()); test_char[4] = new char[__MAXREG__.length() + int2str(maxreg).length() + 1]; // -maxrregcount = <?> strcpy(test_char[4], get_option(__MAXREG__, int2str(maxreg)).c_str()); test_char[5] = new char[__RIB__.length() + int2str(warp).length() + 1]; // #define RIB <?> : warps per block strcpy(test_char[5], get_option(__RIB__, int2str(warp)).c_str()); test_char[6] = new char[__SIZE__.length() + int2str((int)force_local_size).length() + 1]; // #define SIZE 40 strcpy(test_char[6], get_option(__SIZE__, int2str((int)force_local_size)).c_str()); test_char[7] = new char[__Q__.length() + int2str(hmm->vitQ).length() + 1]; // #define Q <?> strcpy(test_char[7], get_option(__Q__, int2str(hmm->vitQ)).c_str()); /* 1. change const char** through pointer */ //char* **test = const_cast<char** *>(&opts); //*test = test_char; /* 2. 
change const char** through reference */ char** &test_ref = const_cast<char** &>(opts); test_ref = test_char; /* NVRTC compile */ NVRTC_SAFE_CALL("nvrtcCompileProgram", nvrtcCompileProgram(prog, // prog 8, // numOptions opts)); // options sdkStopTimer(&timer); printf("nvrtc Creat and Compile: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); //======================================================================================// // /* dump log */ // // size_t logSize; // // NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize)); // // char *log = (char *) malloc(sizeof(char) * logSize + 1); // // NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log)); // // log[logSize] = '\x0'; // // std::cerr << "\n compilation log ---\n"; // // std::cerr << log; // // std::cerr << "\n end log ---\n"; // // free(log); // //======================================================================================// /* NVRTC fetch PTX */ sdkCreateTimer(&timer); sdkStartTimer(&timer); size_t ptxsize; NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxsize)); char *ptx = new char[ptxsize]; NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx)); NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog)); // destroy program instance /* Launch PTX by driver API */ checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 0, 0, 0)); checkCudaErrors(cuModuleGetFunction(&kernel, module, "KERNEL")); // return the handle of function, name is the same as real kernel function sdkStopTimer(&timer); printf("Compile & Load time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /**************************************/ /* 4. 
GPU kernel launch by driver API */ /**************************************/ sdkCreateTimer(&timer); sdkStartTimer(&timer); cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1); /* parameters for kernel funciton */ void *arr[] = { &d_seq, &number, &d_offset, &score, &d_len, &d_len_6r, &mat_v, &trans, &(hmm->base_vs), &(hmm->E_lm), &(hmm->ddbound_vs), &(hmm->scale_w), &(hmm->vitQ), &(hmm->MU[1]), &(hmm->LAMBDA[1])}; /* launch kernel */ checkCudaErrors(cuLaunchKernel( kernel, GRID.x, GRID.y, GRID.z, /* grid dim */ BLOCK.x, BLOCK.y, BLOCK.z, /* block dim */ 0,0, /* SMEM, stream */ &arr[0], /* kernel params */ 0)); /* extra opts */ /* wait for kernel finish */ checkCudaErrors(cuCtxSynchronize()); /* block for a context's task to complete */ sdkStopTimer(&timer); printf("Kernel time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /*****************************************/ /* 5. P-value return and post-processing */ /*****************************************/ sdkCreateTimer(&timer); sdkStartTimer(&timer); checkCudaErrors(cuMemcpyDtoH(pVal, score, number * sizeof(double))); sdkStopTimer(&timer); printf("D to H copy time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /* count the number of seqs pass */ unsigned long pass_vit = 0; /* # of seqs pass vit */ for (int i = 0; i < number; i++) { if (pVal[i] <= F2) pass_vit++; } printf("| PASS VIT \n"); printf("| ALL | FWD |\n"); printf("| %d | %d |\n", pass_vit, pass_vit); /************************/ /* 6. clean the context */ /************************/ checkCudaErrors(cuDevicePrimaryCtxReset(cuDevice)); /* reset */ checkCudaErrors(cuCtxSynchronize()); /* block for a context's task to complete */ }