int main(int argc, char* argv[]) { //int iTest = 2896; //while (iTest < 0x7fff) //{ // int iResult = iTest * iTest; // float fTest = (float)iTest; // int fResult = (int)(fTest * fTest); // printf("i*i:%08x f*f:%08x\n", iResult, fResult); // iTest += 0x0800; //} //exit(0); char deviceName[32]; int devCount, ordinal, major, minor; CUdevice hDevice; // Initialize the Driver API and find a device CUDA_CHECK( cuInit(0) ); CUDA_CHECK( cuDeviceGetCount(&devCount) ); for (ordinal = 0; ordinal < devCount; ordinal++) { CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); if (major >= 5 && minor >= 2) { printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); break; } } if (ordinal == devCount) { printf("No compute 5.0 device found, exiting.\n"); exit(EXIT_FAILURE); } // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing int internalTiming = 1; if (argc > 1) internalTiming = strcmp(argv[1], "i") == 0 ? 
1 : 0; // Second command line arg is the number of blocks int blocks = 1; if (argc > 2) blocks = atoi(argv[2]); if (blocks < 1) blocks = 1; // Third command line arg is the number of threads int threads = 128; if (argc > 3) threads = atoi(argv[3]); if (threads > 1024 || threads < 32) threads = 128; threads &= -32; // Forth command line arg: double fops = 1.0; int lanes = 1; if (argc > 4) { if (internalTiming) { // The number of lanes to print for each warp lanes = atoi(argv[4]); if (lanes > 32 || lanes < 1) lanes = 1; } else // The number of floating point operations in a full kernel launch fops = atof(argv[4]); } // Fifth command line arg is the repeat count for benchmarking int repeat = 1; if (argc > 5) repeat = atoi(argv[5]); if (repeat > 1000 || repeat < 1) repeat = 1; // threads = total number of threads size_t size = sizeof(int) * threads * blocks; // Setup our input and output buffers int* dataIn = (int*)malloc(size); int* dataOut = (int*)malloc(size); int* clocks = (int*)malloc(size); memset(dataIn, 0, size); CUmodule hModule; CUfunction hKernel; CUevent hStart, hStop; CUdeviceptr devIn, devOut, devClocks; // Init our context and device memory buffers CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); CUDA_CHECK( cuMemAlloc(&devIn, size) ); CUDA_CHECK( cuMemAlloc(&devOut, size) ); CUDA_CHECK( cuMemAlloc(&devClocks, size) ); CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); // Load our kernel CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); // Setup the params void* params[] = { &devOut, &devClocks, &devIn }; float ms = 0; // Warm up the clock (unless under nsight) if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER for (int i = 0; i < repeat; i++) 
CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); // Launch the kernel CUDA_CHECK( cuEventRecord(hStart, NULL) ); //CUDA_CHECK( cuProfilerStart() ); CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); //CUDA_CHECK( cuProfilerStop() ); CUDA_CHECK( cuEventRecord(hStop, NULL) ); CUDA_CHECK( cuEventSynchronize(hStop) ); CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); //CUDA_CHECK( cuCtxSynchronize() ); // Get back our results from each kernel CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); // Cleanup and shutdown of cuda CUDA_CHECK( cuEventDestroy(hStart) ); CUDA_CHECK( cuEventDestroy(hStop) ); CUDA_CHECK( cuModuleUnload(hModule) ); CUDA_CHECK( cuMemFree(devIn) ); CUDA_CHECK( cuMemFree(devOut) ); CUDA_CHECK( cuMemFree(devClocks) ); CUDA_CHECK( cuCtxDestroy(hContext) ); hContext = 0; // When using just one block, print out the internal timing data if (internalTiming) { int count = 0, total = 0, min = 999999, max = 0; int* clocks_p = clocks; int* dataOut_p = dataOut; // Loop over and print results for (int blk = 0; blk < blocks; blk++) { float *fDataOut = reinterpret_cast<float*>(dataOut_p); for(int tid = 0; tid < threads; tid += 32) { // Sometimes we want data on each thread, sometimes just one sample per warp is fine for (int lane = 0; lane < lanes; lane++) printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u count++; total += clocks_p[tid]; if (clocks_p[tid] < min) min = clocks_p[tid]; if (clocks_p[tid] > max) max = clocks_p[tid]; } clocks_p += threads; dataOut_p += threads; } printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); } else { // For more than one block we're testing throughput and want external timing data printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); } // And free up host memory free(dataIn); 
free(dataOut); free(clocks); return 0; }
void GPUInterface::LaunchKernelConcurrent(GPUFunction deviceFunction, Dim3Int block, Dim3Int grid, int streamIndex, int waitIndex, int parameterCountV, int totalParameterCount, ...) { // parameters #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernelConcurrent\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); void** params; GPUPtr* paramPtrs; unsigned int* paramInts; params = (void**)malloc(sizeof(void*) * totalParameterCount); paramPtrs = (GPUPtr*)malloc(sizeof(GPUPtr) * totalParameterCount); paramInts = (unsigned int*)malloc(sizeof(unsigned int) * totalParameterCount); va_list parameters; va_start(parameters, totalParameterCount); for(int i = 0; i < parameterCountV; i++) { paramPtrs[i] = (GPUPtr)(size_t)va_arg(parameters, GPUPtr); params[i] = (void*)¶mPtrs[i]; } for(int i = parameterCountV; i < totalParameterCount; i++) { paramInts[i-parameterCountV] = va_arg(parameters, unsigned int); params[i] = (void*)¶mInts[i-parameterCountV]; } va_end(parameters); if (streamIndex >= 0) { int streamIndexMod = streamIndex % numStreams; if (waitIndex >= 0) { int waitIndexMod = waitIndex % numStreams; SAFE_CUDA(cuStreamWaitEvent(cudaStreams[streamIndexMod], cudaEvents[waitIndexMod], 0)); } SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, cudaStreams[streamIndexMod], params, NULL)); SAFE_CUDA(cuEventRecord(cudaEvents[streamIndexMod], cudaStreams[streamIndexMod])); } else { SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, cudaStreams[0], params, NULL)); } free(params); free(paramPtrs); free(paramInts); SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::LaunchKernelConcurrent\n"); #endif }
extern "C" void binomialOptionsGPU( real *callValue, TOptionData *optionData, int optN, int argc, char **argv ) { if (!moduleLoaded) { kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]); compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize); module = loadPTX(ptx, argc, argv); moduleLoaded = true; } __TOptionData h_OptionData[MAX_OPTIONS]; for (int i = 0; i < optN; i++) { const real T = optionData[i].T; const real R = optionData[i].R; const real V = optionData[i].V; const real dt = T / (real)NUM_STEPS; const real vDt = V * sqrt(dt); const real rDt = R * dt; //Per-step interest and discount factors const real If = exp(rDt); const real Df = exp(-rDt); //Values and pseudoprobabilities of upward and downward moves const real u = exp(vDt); const real d = exp(-vDt); const real pu = (If - d) / (u - d); const real pd = (real)1.0 - pu; const real puByDf = pu * Df; const real pdByDf = pd * Df; h_OptionData[i].S = (real)optionData[i].S; h_OptionData[i].X = (real)optionData[i].X; h_OptionData[i].vDt = (real)vDt; h_OptionData[i].puByDf = (real)puByDf; h_OptionData[i].pdByDf = (real)pdByDf; } CUfunction kernel_addr; checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel")); CUdeviceptr d_OptionData; checkCudaErrors(cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData")); checkCudaErrors(cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData))); dim3 cudaBlockSize(128,1,1); dim3 cudaGridSize(optN, 1, 1); checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */ cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ 0,0, /* shared mem, stream */ NULL, /* arguments */ 0)); checkCudaErrors(cuCtxSynchronize()); CUdeviceptr d_CallValue; checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue")); checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN *sizeof(real))); }
void RTC_VIT(unsigned int number, const char* GPU_kernel, HMMER_PROFILE *hmm, unsigned int *seq_1D, unsigned int *offset, unsigned int *seq_len, unsigned int *iLen, unsigned int sum, double *pVal, int warp, int maxreg, dim3 GRID, dim3 BLOCK) { /*********************************/ /* 0. Prepare for cuda drive API */ /*********************************/ CUdevice cuDevice; CUcontext context; CUmodule module; CUfunction kernel; checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGet(&cuDevice, 0)); checkCudaErrors(cuCtxCreate(&context, 0, cuDevice)); /*********************************************/ /* 1. Device Property: fixed based on Device */ /*********************************************/ /****************************************/ /* 2. Device Memory Allocation and copy */ /****************************************/ StopWatchInterface *timer; sdkCreateTimer(&timer); sdkStartTimer(&timer); /* Driver API pointers */ CUdeviceptr d_seq, d_offset, d_len, d_len_6r, mat_v, trans, score; /* Allocation */ checkCudaErrors(cuMemAlloc(&d_seq, sum * sizeof(unsigned int))); /* copy 1D database */ checkCudaErrors(cuMemAlloc(&d_offset, number * sizeof(unsigned int))); /* copy offset of each seq*/ checkCudaErrors(cuMemAlloc(&d_len, number * sizeof(unsigned int))); /* copy raw length of each seq */ checkCudaErrors(cuMemAlloc(&d_len_6r, number * sizeof(unsigned int))); /* copy padding length of each seq */ checkCudaErrors(cuMemAlloc(&mat_v, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__))); /* striped EMISSION score */ checkCudaErrors(cuMemAlloc(&trans, hmm->vitQ * TRANS_TYPE * sizeof(__32int__))); /* striped transition score */ checkCudaErrors(cuMemAlloc(&score, number * sizeof(double))); /* P-Value as output */ /* H to D copy */ checkCudaErrors(cuMemcpyHtoD(d_seq, seq_1D, sum * sizeof(unsigned int))); checkCudaErrors(cuMemcpyHtoD(d_offset, offset, number * sizeof(unsigned int))); checkCudaErrors(cuMemcpyHtoD(d_len, seq_len, number * sizeof(unsigned int))); 
checkCudaErrors(cuMemcpyHtoD(d_len_6r, iLen, number * sizeof(unsigned int))); checkCudaErrors(cuMemcpyHtoD(mat_v, hmm->vit_vec, hmm->vitQ * PROTEIN_TYPE * sizeof(__32int__))); checkCudaErrors(cuMemcpyHtoD(trans, hmm->trans_vec, hmm->vitQ * TRANS_TYPE * sizeof(__32int__))); sdkStopTimer(&timer); printf("Alloc & H to D Copy time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /********************************************************/ /* 3. Runtime compilation, Generate PTX and Load module */ /********************************************************/ sdkCreateTimer(&timer); sdkStartTimer(&timer); /* NVRTC create handle */ nvrtcProgram prog; NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, // prog GPU_kernel, // buffer NULL, // name: CUDA program name. name can be NULL; “default_program” is used when it is NULL. 0, // numHeaders (I put header file path with -I later) NULL, // headers' content NULL)); // include full name of headers /* 1. eliminate const through pointer */ char *a = NULL; const char *b = a; const char **opts = &b; /* 2. 
elminate const through reference */ //char a_value = 'c'; //char* aa = &a_value; //const char *&bb = aa; // no way with const //const char**&ref = aa; // no way /* Dynamic Options */ char **test_char = new char*[8]; test_char[0] = new char[__INCLUDE__.length() + strlen("simd_def.h") + 1]; // #include simd_def.h strcpy(test_char[0], get_option(__INCLUDE__, "simd_def.h").c_str()); test_char[1] = new char[__INCLUDE__.length() + strlen("simd_functions.h") + 1]; // #include simd_functions.h strcpy(test_char[1], get_option(__INCLUDE__, "simd_functions.h").c_str()); test_char[2] = new char[__RDC__.length() + __F__.length() + 1]; // -rdc=false strcpy(test_char[2], get_option(__RDC__, __F__).c_str()); test_char[3] = new char[__ARCH__.length() + __CC35__.length() + 1]; // -arch=compute_35 strcpy(test_char[3], get_option(__ARCH__, __CC35__).c_str()); test_char[4] = new char[__MAXREG__.length() + int2str(maxreg).length() + 1]; // -maxrregcount = <?> strcpy(test_char[4], get_option(__MAXREG__, int2str(maxreg)).c_str()); test_char[5] = new char[__RIB__.length() + int2str(warp).length() + 1]; // #define RIB <?> : warps per block strcpy(test_char[5], get_option(__RIB__, int2str(warp)).c_str()); test_char[6] = new char[__SIZE__.length() + int2str((int)force_local_size).length() + 1]; // #define SIZE 40 strcpy(test_char[6], get_option(__SIZE__, int2str((int)force_local_size)).c_str()); test_char[7] = new char[__Q__.length() + int2str(hmm->vitQ).length() + 1]; // #define Q <?> strcpy(test_char[7], get_option(__Q__, int2str(hmm->vitQ)).c_str()); /* 1. change const char** through pointer */ //char* **test = const_cast<char** *>(&opts); //*test = test_char; /* 2. 
change const char** through reference */ char** &test_ref = const_cast<char** &>(opts); test_ref = test_char; /* NVRTC compile */ NVRTC_SAFE_CALL("nvrtcCompileProgram", nvrtcCompileProgram(prog, // prog 8, // numOptions opts)); // options sdkStopTimer(&timer); printf("nvrtc Creat and Compile: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); //======================================================================================// // /* dump log */ // // size_t logSize; // // NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize)); // // char *log = (char *) malloc(sizeof(char) * logSize + 1); // // NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log)); // // log[logSize] = '\x0'; // // std::cerr << "\n compilation log ---\n"; // // std::cerr << log; // // std::cerr << "\n end log ---\n"; // // free(log); // //======================================================================================// /* NVRTC fetch PTX */ sdkCreateTimer(&timer); sdkStartTimer(&timer); size_t ptxsize; NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxsize)); char *ptx = new char[ptxsize]; NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx)); NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog)); // destroy program instance /* Launch PTX by driver API */ checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 0, 0, 0)); checkCudaErrors(cuModuleGetFunction(&kernel, module, "KERNEL")); // return the handle of function, name is the same as real kernel function sdkStopTimer(&timer); printf("Compile & Load time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /**************************************/ /* 4. 
GPU kernel launch by driver API */ /**************************************/ sdkCreateTimer(&timer); sdkStartTimer(&timer); cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1); /* parameters for kernel funciton */ void *arr[] = { &d_seq, &number, &d_offset, &score, &d_len, &d_len_6r, &mat_v, &trans, &(hmm->base_vs), &(hmm->E_lm), &(hmm->ddbound_vs), &(hmm->scale_w), &(hmm->vitQ), &(hmm->MU[1]), &(hmm->LAMBDA[1])}; /* launch kernel */ checkCudaErrors(cuLaunchKernel( kernel, GRID.x, GRID.y, GRID.z, /* grid dim */ BLOCK.x, BLOCK.y, BLOCK.z, /* block dim */ 0,0, /* SMEM, stream */ &arr[0], /* kernel params */ 0)); /* extra opts */ /* wait for kernel finish */ checkCudaErrors(cuCtxSynchronize()); /* block for a context's task to complete */ sdkStopTimer(&timer); printf("Kernel time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /*****************************************/ /* 5. P-value return and post-processing */ /*****************************************/ sdkCreateTimer(&timer); sdkStartTimer(&timer); checkCudaErrors(cuMemcpyDtoH(pVal, score, number * sizeof(double))); sdkStopTimer(&timer); printf("D to H copy time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); /* count the number of seqs pass */ unsigned long pass_vit = 0; /* # of seqs pass vit */ for (int i = 0; i < number; i++) { if (pVal[i] <= F2) pass_vit++; } printf("| PASS VIT \n"); printf("| ALL | FWD |\n"); printf("| %d | %d |\n", pass_vit, pass_vit); /************************/ /* 6. clean the context */ /************************/ checkCudaErrors(cuDevicePrimaryCtxReset(cuDevice)); /* reset */ checkCudaErrors(cuCtxSynchronize()); /* block for a context's task to complete */ }
///////////////////////////////////////////////////// // Main program ///////////////////////////////////////////////////// int main(int argc, char **argv) { typedef long clock_t; unsigned int num_warps = NUM_BLOCKS * NUM_THREADS / 32; // we allocate two timer for each warp clock_t *timer = (clock_t*)malloc(num_warps * sizeof(clock_t) * 2); // Initialize CUDA driver checkCudaErrors(cuInit(0)); // Get number of devices supporting CUDA int deviceCount = 0; checkCudaErrors(cuDeviceGetCount(&deviceCount)); if (deviceCount == 0) { printf("There is no device supporting CUDA.\n"); exit (0); } // Get handle for device 0 CUdevice cuDevice; checkCudaErrors(cuDeviceGet(&cuDevice, 0)); // Create context CUcontext cuContext; checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); // JIT compile the kernel from PTX and get the handle CUfunction kernel_addr; CUmodule cuModule; ptxJIT(&cuModule, &kernel_addr, "clock.ptx", "timeDummy"); // Allocate timer on device CUdeviceptr dtimer; checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * num_warps * 2)); dim3 cudaBlockSize(NUM_THREADS, 1, 1); dim3 cudaGridSize(NUM_BLOCKS, 1, 1); void *kernel_param[] = {(void*)&dtimer}; checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 0, 0, &kernel_param[0], 0)); // Sync the context checkCudaErrors(cuCtxSynchronize()); // copy result back to host checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * num_warps * 2)); // Compute the execution time of the kernel clock_t minStart = timer[0]; clock_t maxEnd = timer[num_warps]; for (int i = 1 ; i < num_warps ; i ++){ minStart = timer[i] < minStart ? timer[i] : minStart; maxEnd = timer[num_warps + i] > maxEnd ? 
timer[num_warps + i] : maxEnd; } printf("Total clocks = %Lf\n", (long double)(maxEnd - minStart)); printf("Number of warps = %u\n", num_warps); // Clean up free(timer); checkCudaErrors(cuMemFree(dtimer)); checkCudaErrors(cuModuleUnload(cuModule)); checkCudaErrors(cuCtxDestroy(cuContext)); return EXIT_SUCCESS; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; const int N = 10; int i; CUstream streams[N]; unsigned long *a, *d_a, dticks; int nbytes; float dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; for (i = 0; i < N; i++) { streams[i] = (CUstream) acc_get_cuda_stream (i); if (streams[i] != NULL) abort (); r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (i, streams[i])) abort (); } for (i = 0; i < N; i++) { r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } } if (acc_async_test_all () != 0) { fprintf (stderr, "asynchronous operation not running\n"); abort (); } sleep ((int) (dtime / 1000.0f) + 1); if (acc_async_test_all () != 
1) { fprintf (stderr, "asynchronous operation not running\n"); abort (); } acc_unmap_data (a); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); exit (0); }
/*
 * Launch the three LU-decomposition kernels from `mod` over the
 * matrix_dim x matrix_dim matrix at device address `m`.
 *
 * For every BLOCK_SIZE step `i` below matrix_dim - BLOCK_SIZE the diagonal,
 * perimeter and internal kernels are launched in that order with
 * (m, matrix_dim, i) as arguments; one last diagonal launch handles the final
 * block.
 *
 * Returns 0. Failures are reported on stdout.
 * NOTE(review): the failure paths also return 0, so a caller cannot tell a
 * failed launch from success -- confirm whether callers inspect the result.
 */
int lud_launch(CUmodule mod, CUdeviceptr m, int matrix_dim)
{
    int i = 0;
    int bdx, bdy, gdx, gdy;
    int shared_size;
    CUfunction f_diagonal, f_perimeter, f_internal;
    CUresult res;

    /* get functions. */
    res = cuModuleGetFunction(&f_diagonal, mod, "_Z12lud_diagonalPfii");
    if (res != CUDA_SUCCESS) {
        printf("cuModuleGetFunction(f_diagonal) failed\n");
        return 0;
    }
    res = cuModuleGetFunction(&f_perimeter, mod, "_Z13lud_perimeterPfii");
    if (res != CUDA_SUCCESS) {
        printf("cuModuleGetFunction(f_perimeter) failed\n");
        return 0;
    }
    res = cuModuleGetFunction(&f_internal, mod, "_Z12lud_internalPfii");
    if (res != CUDA_SUCCESS) {
        printf("cuModuleGetFunction(f_internal) failed\n");
        return 0;
    }

    for (i = 0; i < matrix_dim - BLOCK_SIZE; i += BLOCK_SIZE) {
        void* param[] = {(void*) &m, (void*) &matrix_dim, (void*) &i};

        /* diagonal */
        gdx = 1;
        gdy = 1;
        bdx = BLOCK_SIZE;
        bdy = 1;
        shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
        res = cuLaunchKernel(f_diagonal, gdx, gdy, 1, bdx, bdy, 1, shared_size,
                             0, (void**) param, NULL);
        if (res != CUDA_SUCCESS) {
            printf("cuLaunchKernel(f_diagonal) failed: res = %u\n", res);
            return 0;
        }

        /* perimeter */
        gdx = (matrix_dim - i) / BLOCK_SIZE - 1;
        gdy = 1;
        bdx = BLOCK_SIZE * 2;
        bdy = 1;
        shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float) * 3;
        res = cuLaunchKernel(f_perimeter, gdx, gdy, 1, bdx, bdy, 1, shared_size,
                             0, (void**) param, NULL);
        if (res != CUDA_SUCCESS) {
            printf("cuLaunchKernel(f_perimeter) failed: res = %u\n", res);
            return 0;
        }

        /* internal */
        gdx = (matrix_dim - i) / BLOCK_SIZE - 1;
        gdy = (matrix_dim - i) / BLOCK_SIZE - 1;
        bdx = BLOCK_SIZE;
        bdy = BLOCK_SIZE;
        shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float) * 2;
        res = cuLaunchKernel(f_internal, gdx, gdy, 1, bdx, bdy, 1, shared_size,
                             0, (void**) param, NULL);
        if (res != CUDA_SUCCESS) {
            printf("cuLaunchKernel(internal) failed: res = %u\n", res);
            return 0;
        }
    }

    /* Final diagonal block. The launch shape is set explicitly so it matches
       the diagonal launch inside the loop instead of inheriting the values
       left behind by the "internal" launch (or nothing at all when the loop
       body never ran). */
    {
        void* param[] = {(void*) &m, (void*) &matrix_dim, (void*) &i};

        gdx = 1;
        gdy = 1;
        bdx = BLOCK_SIZE;
        bdy = 1;
        shared_size = BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
        res = cuLaunchKernel(f_diagonal, gdx, gdy, 1, bdx, bdy, 1, shared_size,
                             0, (void**) param, NULL);
        if (res != CUDA_SUCCESS) {
            printf("cuLaunchKernel(f_diagonal) failed: res = %u\n", res);
            return 0;
        }
    }

    return 0;
}
int main (int argc, char **argv) { CUdevice dev; CUfunction delay2; CUmodule module; CUresult r; int N; int i; CUstream *streams; unsigned long **a, **d_a, *tid, ticks; int nbytes; void *kargs[3]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay2, module, "delay2"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = sizeof (int); ticks = (unsigned long) (200.0 * clkrate); N = nprocs; streams = (CUstream *) malloc (N * sizeof (void *)); a = (unsigned long **) malloc (N * sizeof (unsigned long *)); d_a = (unsigned long **) malloc (N * sizeof (unsigned long *)); tid = (unsigned long *) malloc (N * sizeof (unsigned long)); for (i = 0; i < N; i++) { a[i] = (unsigned long *) malloc (sizeof (unsigned long)); *a[i] = N; d_a[i] = (unsigned long *) acc_malloc (nbytes); tid[i] = i; acc_map_data (a[i], d_a[i], nbytes); streams[i] = (CUstream) acc_get_cuda_stream (i); if (streams[i] != NULL) abort (); r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (i, streams[i])) abort (); } for (i = 0; i < N; i++) { kargs[0] = (void *) &d_a[i]; kargs[1] = (void *) &ticks; kargs[2] = (void *) &tid[i]; r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, 
streams[i], kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } ticks = (unsigned long) (50.0 * clkrate); } acc_wait_all_async (0); for (i = 0; i < N; i++) { acc_copyout (a[i], nbytes); if (*a[i] != i) abort (); } free (streams); for (i = 0; i < N; i++) { free (a[i]); } free (a); free (d_a); free (tid); acc_shutdown (acc_device_nvidia); exit (0); }
T run_function(const std::string& name, const T input, const int shiftValue) { const std::string test_source = "//\n" "// Generated by NVIDIA NVVM Compiler\n" "//\n" "// Compiler Build ID: CL-19856038\n" "// Cuda compilation tools, release 7.5, V7.5.17\n" "// Based on LLVM 3.4svn\n" "//\n" "\n" ".version 4.3\n" ".target sm_20\n" ".address_size 64\n" "\n" " // .globl _Z10kernel_s32Piii\n" "\n" ".visible .entry _Z10kernel_s32Piii(\n" " .param .u64 _Z10kernel_s32Piii_param_0,\n" " .param .u32 _Z10kernel_s32Piii_param_1,\n" " .param .u32 _Z10kernel_s32Piii_param_2\n" ")\n" "{\n" " .reg .b32 %r<4>;\n" " .reg .b64 %rd<3>;\n" "\n" "\n" " ld.param.u64 %rd1, [_Z10kernel_s32Piii_param_0];\n" " ld.param.u32 %r1, [_Z10kernel_s32Piii_param_1];\n" " ld.param.u32 %r2, [_Z10kernel_s32Piii_param_2];\n" " cvta.to.global.u64 %rd2, %rd1;\n" " shr.s32 %r3, %r1, %r2;\n" " st.global.u32 [%rd2], %r3;\n" " ret;\n" "}\n" "\n" " // .globl _Z10kernel_s64Pxxi\n" ".visible .entry _Z10kernel_s64Pxxi(\n" " .param .u64 _Z10kernel_s64Pxxi_param_0,\n" " .param .u64 _Z10kernel_s64Pxxi_param_1,\n" " .param .u32 _Z10kernel_s64Pxxi_param_2\n" ")\n" "{\n" " .reg .b32 %r<2>;\n" " .reg .b64 %rd<5>;\n" "\n" "\n" " ld.param.u64 %rd1, [_Z10kernel_s64Pxxi_param_0];\n" " ld.param.u64 %rd2, [_Z10kernel_s64Pxxi_param_1];\n" " ld.param.u32 %r1, [_Z10kernel_s64Pxxi_param_2];\n" " cvta.to.global.u64 %rd3, %rd1;\n" " shr.s64 %rd4, %rd2, %r1;\n" " st.global.u64 [%rd3], %rd4;\n" " ret;\n" "}\n" "\n" " // .globl _Z10kernel_u32Pjji\n" ".visible .entry _Z10kernel_u32Pjji(\n" " .param .u64 _Z10kernel_u32Pjji_param_0,\n" " .param .u32 _Z10kernel_u32Pjji_param_1,\n" " .param .u32 _Z10kernel_u32Pjji_param_2\n" ")\n" "{\n" " .reg .b32 %r<4>;\n" " .reg .b64 %rd<3>;\n" "\n" "\n" " ld.param.u64 %rd1, [_Z10kernel_u32Pjji_param_0];\n" " ld.param.u32 %r1, [_Z10kernel_u32Pjji_param_1];\n" " ld.param.u32 %r2, [_Z10kernel_u32Pjji_param_2];\n" " cvta.to.global.u64 %rd2, %rd1;\n" " shr.u32 %r3, %r1, %r2;\n" " st.global.u32 [%rd2], 
%r3;\n" " ret;\n" "}\n" "\n" " // .globl _Z10kernel_u64Pyyi\n" ".visible .entry _Z10kernel_u64Pyyi(\n" " .param .u64 _Z10kernel_u64Pyyi_param_0,\n" " .param .u64 _Z10kernel_u64Pyyi_param_1,\n" " .param .u32 _Z10kernel_u64Pyyi_param_2\n" ")\n" "{\n" " .reg .b32 %r<2>;\n" " .reg .b64 %rd<5>;\n" "\n" "\n" " ld.param.u64 %rd1, [_Z10kernel_u64Pyyi_param_0];\n" " ld.param.u64 %rd2, [_Z10kernel_u64Pyyi_param_1];\n" " ld.param.u32 %r1, [_Z10kernel_u64Pyyi_param_2];\n" " cvta.to.global.u64 %rd3, %rd1;\n" " shr.u64 %rd4, %rd2, %r1;\n" " st.global.u64 [%rd3], %rd4;\n" " ret;\n" "}\n" "\n" "\n" ; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, test_source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, name.c_str())); T output; CUdeviceptr devOutput; cu_assert(cuMemAlloc(&devOutput, sizeof(output))); void * params[] = {&devOutput, (void*)&input, (void*)&shiftValue}; auto result = cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr); cu_assert(result); cu_assert(cuMemcpyDtoH(&output, devOutput, sizeof(output))); cu_assert(cuMemFree(devOutput)); cu_assert(cuModuleUnload(modId)); return output; }
/*
 * Launch the NV12 -> ARGB conversion kernel fpFunc on streamID.
 *
 * d_srcNV12 / nSourcePitch  source surface and its pitch
 * d_dstARGB / nDestPitch    destination surface and its pitch
 * width, height             image dimensions
 *
 * The grid uses 32x16 thread blocks; each thread outputs two pixels, so the
 * grid width covers width / 2 columns (rounded up).
 *
 * With __CUDA_API_VERSION >= 4000 the arguments are passed through
 * cuLaunchKernel; otherwise the legacy cuParamSet* / cuLaunchGridAsync
 * sequence is used.
 *
 * Returns the CUresult of the launch call; a failure is also reported on
 * stderr.
 */
CUresult cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                 CUdeviceptr d_dstARGB, size_t nDestPitch,
                                 uint32 width, uint32 height,
                                 CUfunction fpFunc, CUstream streamID)
{
    CUresult status;

    // Each thread will output 2 pixels at a time. The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if __CUDA_API_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height
                   };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    checkCudaErrors(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers

    // device pointer for Source Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_srcNV12, sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_dstARGB, sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nDestPitch, sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    checkCudaErrors(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    checkCudaErrors(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    checkCudaErrors(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions.
    // Assign to the status declared above; redeclaring it here would not compile.
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", (void *)fpFunc, status);
    }

    return status;
}