//host driver void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int nrhs, hostdrv_pars_t *prhs) { //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); CUresult err = CUDA_SUCCESS; // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, shmem)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imgSizeX)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imgSizeX); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imgSizeY)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imgSizeY); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, grid.x, grid.y, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } }
//----------------------------------------------------------------------------// bool CUDAImpl::_LaunchKernel(Kernel & kernel, const CUfunction & cudaKernel, std::string * err) { // Set CUDA kernel arguments CUresult c_err; int paramOffset = 0; for(size_t i = 0; i < kernel.inBuffers.size(); ++i) { c_err = cuParamSetv(cudaKernel, paramOffset, &_cudaBuffers[kernel.inBuffers[i].buffer->name], sizeof(void*)); paramOffset += sizeof(void *); } for(size_t i = 0; i < kernel.outBuffers.size(); ++i) { c_err = cuParamSetv(cudaKernel, paramOffset, &_cudaBuffers[kernel.outBuffers[i].buffer->name], sizeof(void*)); paramOffset += sizeof(void *); } for(size_t i = 0; i < kernel.paramsInt.size(); ++i) { c_err = cuParamSetv(cudaKernel, paramOffset, &kernel.paramsInt[i].value, sizeof(int)); paramOffset += sizeof(int); } for(size_t i = 0; i < kernel.paramsFloat.size(); ++i) { c_err = cuParamSetv(cudaKernel, paramOffset, &kernel.paramsFloat[i].value, sizeof(float)); paramOffset += sizeof(float); } // int and width parameters c_err = cuParamSetv(cudaKernel, paramOffset, &_w, sizeof(int)); paramOffset += sizeof(int); c_err = cuParamSetv(cudaKernel, paramOffset, &_h, sizeof(int)); paramOffset += sizeof(int); // It should be fine to check once all the arguments have been set if(_cudaErrorCheckParamSet(c_err, err, kernel.name)) { return false; } c_err = cuParamSetSize(cudaKernel, paramOffset); if (_cudaErrorParamSetSize(c_err, err, kernel.name)) { return false; } // Launch the CUDA kernel const int nBlocksHor = _w / 16 + 1; const int nBlocksVer = _h / 16 + 1; cuFuncSetBlockShape(cudaKernel, 16, 16, 1); c_err = cuLaunchGrid(cudaKernel, nBlocksHor, nBlocksVer); if (_cudaErrorLaunchKernel(c_err, err, kernel.name)) { return false; } return true; }
/**
 * Invokes the kernel @f on a @gridDimX x @gridDimY x @gridDimZ grid of blocks.
 * Each block contains @blockDimX x @blockDimY x @blockDimZ threads.
 * @sharedMemBytes sets the amount of dynamic shared memory that will be
 * available to each thread block.
 *
 * cuLaunchKernel() can optionally be associated to a stream by passing a
 * non-zero hStream argument.
 *
 * Kernel parameters to @f can be specified in one of two ways:
 *
 * 1) Kernel parameters can be specified via kernelParams. If f has N
 * parameters, then kernelParams needs to be an array of N pointers. Each of
 * kernelParams[0] through kernelParams[N-1] must point to a region of memory
 * from which the actual kernel parameter will be copied. The number of kernel
 * parameters and their offsets and sizes do not need to be specified as that
 * information is retrieved directly from the kernel's image.
 *
 * 2) Kernel parameters can also be packaged by the application into a single
 * buffer that is passed in via the extra parameter. This places the burden on
 * the application of knowing each kernel parameter's size and alignment/
 * padding within the buffer. Here is an example of using the extra parameter
 * in this manner:
 *
 *     size_t argBufferSize;
 *     char argBuffer[256];
 *
 *     // populate argBuffer and argBufferSize
 *
 *     void *config[] = {
 *         CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
 *         CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
 *         CU_LAUNCH_PARAM_END
 *     };
 *     status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 *
 * The extra parameter exists to allow cuLaunchKernel to take additional less
 * commonly used arguments. extra specifies a list of names of extra settings
 * and their corresponding values. Each extra setting name is immediately
 * followed by the corresponding value. The list must be terminated with
 * either NULL or CU_LAUNCH_PARAM_END.
 *
 * CU_LAUNCH_PARAM_END, which indicates the end of the extra array;
 * CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next value in
 * extra will be a pointer to a buffer containing all the kernel parameters
 * for launching kernel f;
 * CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next value in extra
 * will be a pointer to a size_t containing the size of the buffer specified
 * with CU_LAUNCH_PARAM_BUFFER_POINTER;
 *
 * The error CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters
 * are specified with both kernelParams and extra (i.e. both kernelParams and
 * extra are non-NULL).
 *
 * Calling cuLaunchKernel() sets persistent function state that is the same as
 * function state set through the following deprecated APIs:
 *
 * cuFuncSetBlockShape() cuFuncSetSharedSize() cuParamSetSize() cuParamSeti()
 * cuParamSetf() cuParamSetv()
 *
 * When the kernel @f is launched via cuLaunchKernel(), the previous block
 * shape, shared size and parameter info associated with @f is overwritten.
 *
 * Note that to use cuLaunchKernel(), the kernel @f must either have been
 * compiled with toolchain version 3.2 or later so that it will contain kernel
 * parameter information, or have no kernel parameters. If either of these
 * conditions is not met, then cuLaunchKernel() will return
 * CUDA_ERROR_INVALID_IMAGE.
 *
 * Limitations of this implementation (as visible in the code below):
 *  - a non-zero hStream is rejected with CUDA_ERROR_INVALID_HANDLE;
 *  - a non-NULL extra is rejected with CUDA_ERROR_INVALID_HANDLE;
 *  - the launch is forwarded to cuLaunchGrid(), which only receives
 *    gridDimX and gridDimY; gridDimZ is not forwarded;
 *  - kernelParams must be non-NULL when the kernel has parameters,
 *    otherwise CUDA_ERROR_INVALID_VALUE is returned.
 *
 * Parameters:
 * f - Kernel to launch
 * gridDimX - Width of grid in blocks
 * gridDimY - Height of grid in blocks
 * gridDimZ - Depth of grid in blocks
 * blockDimX - X dimension of each thread block
 * blockDimY - Y dimension of each thread block
 * blockDimZ - Z dimension of each thread block
 * sharedMemBytes - Dynamic shared-memory size per thread block in bytes
 * hStream - Stream identifier
 * kernelParams - Array of pointers to kernel parameters
 * extra - Extra options
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_HANDLE,
 * CUDA_ERROR_INVALID_IMAGE, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_LAUNCH_FAILED,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, CUDA_ERROR_LAUNCH_TIMEOUT,
 * CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
 * CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
 */
CUresult cuLaunchKernel
(CUfunction f,
 unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
 unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
 unsigned int sharedMemBytes, CUstream hStream,
 void **kernelParams, void **extra)
{
	struct gdev_cuda_raw_func *rf;
	CUresult res;
	int i;

	if (hStream) {
		GDEV_PRINT("cuLaunchKernel: Stream is not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	if (extra) {
		GDEV_PRINT("cuLaunchKernel: Extra Parameters are not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	res = cuFuncSetSharedSize(f, sharedMemBytes);
	if (res != CUDA_SUCCESS)
		return res;

	res = cuFuncSetBlockShape(f, blockDimX, blockDimY, blockDimZ);
	if (res != CUDA_SUCCESS)
		return res;

	rf = &f->raw_func;

	/* Parameters are read through kernelParams[i]; refuse a NULL array
	 * instead of dereferencing it when the kernel expects arguments. */
	if (rf->param_count > 0 && !kernelParams)
		return CUDA_ERROR_INVALID_VALUE;

	/* Copy every argument to the offset/size recorded for it in the
	 * kernel's parameter table, stopping at the first failure. */
	for (i = 0; i < rf->param_count; i++) {
		void *p = kernelParams[i];
		int offset = rf->param_info[i].offset;
		uint32_t size = rf->param_info[i].size;

		res = cuParamSetv(f, offset, p, size);
		if (res != CUDA_SUCCESS)
			return res;
	}

	res = cuParamSetSize(f, rf->param_size);
	if (res != CUDA_SUCCESS)
		return res;

	/* Only the X and Y grid dimensions are passed on. */
	res = cuLaunchGrid(f, gridDimX, gridDimY);
	if (res != CUDA_SUCCESS)
		return res;

	return CUDA_SUCCESS;
}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 *
 * Loads the cubin image assembled from `buffers` into the file-level
 * `cuModule`, resolves the kernel entry point into `cuFunction`, selects the
 * L1-preferring cache configuration and binds the kernel arguments:
 * six device pointers (gcInfoSpace, gpuToSpace, gpuHandlesMemory,
 * gpuHeapEndPtr, gpuBufferSize, gpuExceptionsMemory) followed by the int
 * num_blocks. `heap_end_ptr` is stored in the file-level `heapEndPtr`.
 *
 * Every driver call is followed by CHECK_STATUS.
 * NOTE(review): CHECK_STATUS is defined outside this block; it is assumed to
 * raise a Java exception and return on failure -- confirm.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jobject buffers, jint size,
   jint total_size, jint num_blocks){

  void * cubin_file;
  int offset;
  CUresult status;

  heapEndPtr = heap_end_ptr;

  //void * cubin_file = readCubinFile("code_file.cubin");
  cubin_file = readCubinFileFromBuffers(env, buffers, size, total_size);
  status = cuModuleLoadData(&cuModule, cubin_file);
  /* Release the host copy of the image before the status check so it is not
   * leaked if the check leaves this function early. */
  free(cubin_file);
  CHECK_STATUS(env,"error in cuModuleLoad",status)

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_i");
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* Total argument size: six device pointers plus one int. */
  status = cuParamSetSize(cuFunction, (6 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(cuFunction, offset, num_blocks);
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
  offset += sizeof(int);
}
// Uploads h_data to d_data, runs the kernel `name` from *phModule with d_data
// as its only argument, and copies the result back into h_data.
//
// Parameters:
//   phModule            module that contains the kernel
//   name                kernel entry point name
//   d_data              device buffer of at least memSize bytes
//   h_data              host buffer of at least memSize bytes (input and output)
//   memSize             number of bytes copied in each direction
//   thread_x/y/z        block dimensions
//   block_x, block_y    grid dimensions passed to cuLaunchGrid
//   block_z             accepted for interface compatibility; not used by the
//                       2D cuLaunchGrid launch
//
// Returns CUDA_SUCCESS, or the status of the first driver call that failed.
// A message is printed for the failing step and no later step is attempted.
CUresult loadAndRunTestFunction(CUmodule *phModule, std::string name, CUdeviceptr &d_data,
                                DataStruct *h_data, unsigned int memSize,
                                int thread_x=1,int thread_y=1,int thread_z=1,
                                int block_x=1, int block_y=1, int block_z=1)
{
  (void) block_z;

  // load the data to the device
  CUresult status = cuMemcpyHtoD(d_data, h_data, memSize);
  if (status != CUDA_SUCCESS) {
    printf("ERROR: during MemcpyHtoD\n");
    return status;
  }

  // Locate the kernel entry point
  CUfunction phKernel = 0;
  status = cuModuleGetFunction(&phKernel, *phModule, name.data());
  if (status != CUDA_SUCCESS) {
    printf("ERROR: could not load function\n");
    return status;
  }

  // Set the kernel parameters
  status = cuFuncSetBlockShape(phKernel, thread_x, thread_y, thread_z);
  if (status != CUDA_SUCCESS) {
    printf("ERROR: during setBlockShape\n");
    return status;
  }

  int paramOffset = 0;
  status = cuParamSetv(phKernel, paramOffset, &d_data, sizeof(DataStruct*));
  if (status != CUDA_SUCCESS) {
    printf("ERROR: during cuParamSetv\n");
    return status;
  }
  paramOffset += sizeof(DataStruct*);

  status = cuParamSetSize(phKernel, paramOffset);
  if (status != CUDA_SUCCESS) {
    printf("ERROR: during cuParamSetSize\n");
    return status;
  }

  // Launch the kernel
  status = cuLaunchGrid(phKernel, block_x, block_y);
  if (status != CUDA_SUCCESS) {
    printf("ERROR: during grid launch\n");
    return status;
  }

  // Copy the result back to the host
  status = cuMemcpyDtoH(h_data, d_data, memSize);
  if (status != CUDA_SUCCESS) {
    printf("ERROR: during MemcpyDtoH\n");
    return status;
  }

  return CUDA_SUCCESS;
}
int main(int argc, char *argv[]) { srand(time(NULL)); for(int k=0;k<4;k++) { int n = 30*(k+1); float x = ((float) rand()) / (float) RAND_MAX; float *a = new float[n+1]; float resultGPU; for(int i = 0; i < n + 1; i++) a[i] = i * 0.5*((float) rand()) / (float) RAND_MAX; int blocks = (n + 1) / BLK_SZ; if((n + 1) % BLK_SZ) blocks++; CUdevice hDevice; CUcontext hContext; CUmodule hModule; CUfunction hFunction; CALL( cuInit(0) ); CALL( cuDeviceGet(&hDevice, 0) ); CALL( cuCtxCreate(&hContext, 0, hDevice) ); CALL( cuModuleLoad(&hModule, "kernel.cubin") ); CALL( cuModuleGetFunction(&hFunction, hModule, "Polynomial") ); //dane wejsciowe - kopiowanie CUdeviceptr DevA, DevResult; CALL( cuMemAlloc(&DevA, (n+1)*sizeof(float) ) ); CALL( cuMemAlloc(&DevResult, sizeof(float) ) ); CALL( cuMemcpyHtoD(DevA, a, (n+1)*sizeof(float) ) ); CALL( cuFuncSetBlockShape(hFunction, BLK_SZ, 1, 1) ); //przekazanie parametrow do kernela int offset = 0; void *ptr; ptr = (void*)(size_t)DevResult; ALIGN_UP(offset, __alignof(ptr)); CALL( cuParamSetv(hFunction, offset, &ptr, sizeof(ptr)) ); offset += sizeof(ptr); ptr = (void*)(size_t)DevA; ALIGN_UP(offset, __alignof(ptr)); CALL( cuParamSetv(hFunction, offset, &ptr, sizeof(ptr)) ); offset += sizeof(ptr); ALIGN_UP(offset, __alignof(float)); CALL( cuParamSetf(hFunction, offset, x) ); offset += sizeof(float); ALIGN_UP(offset, __alignof(int)); CALL( cuParamSeti(hFunction, offset, n) ); offset += sizeof(int); CALL( cuParamSetSize(hFunction, offset) ); CALL( cuLaunchGrid(hFunction, blocks, 1) ); //kopiowanie wyniku na hosta CALL( cuMemcpyDtoH((void *) &resultGPU, DevResult, sizeof(float) ) ); //zwalnianie pamieci na urzadzeniu CALL( cuMemFree(DevA) ); CALL( cuMemFree(DevResult) ); //obliczenia na CPU float resultCPU = PolynomialCPU(a, x, n); std::cout << "GPU:\t" << resultGPU << std::endl; std::cout << "CPU:\t" << resultCPU << std::endl; std::cout << "roznica:\t" << fabs(resultGPU - resultCPU) << std::endl; delete [] a; } return 0; }
int cuda_test_fmadd(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; float *a = (float *) malloc (n*n * sizeof(float)); float *b = (float *) malloc (n*n * sizeof(float)); float *c = (float *) malloc (n*n * sizeof(float)); int block_x, block_y, grid_x, grid_y; int offset; char fname[256]; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; float exec; /* initialize A[] & B[] */ for (i = 0; i < n; i++) { for(j = 0; j < n; j++) { idx = i * n + j; a[idx] = i + 0.1; b[idx] = i + 0.1; } } /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; printf("block = (%d, %d)\n", block_x, block_y); printf("grid = (%d, %d)\n", grid_x, grid_y); gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/fmadd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetSharedSize(function, 0x40); /* just random */ if (res != CUDA_SUCCESS) { printf("cuFuncSetSharedSize() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res 
!= CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } gettimeofday(&tv_h2d_start, NULL); /* upload a[] and b[] */ res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_h2d_end, NULL); /* set kernel parameters */ offset = 0; res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(a_dev); res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(b_dev); res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(c_dev); res = cuParamSetv(function, offset, &n, sizeof(n)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(n); res = cuParamSetSize(function, offset); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); /* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", 
(unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_d2h_end, NULL); res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); /* check the results */ i = j = idx = 0; while (i < n) { while (j < n) { idx = i * n + j; if (c[idx] != a[idx] + b[idx]) { printf("c[%d] = %f\n", idx, c[idx]); printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]); return -1; } j++; } i++; } free(a); free(b); free(c); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); printf("Exec: %f\n", exec); printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec); printf("Total: %f\n", total); return 0; }
/*
 * swanRunKernelAsync: look up a kernel by name, marshal its arguments and
 * launch it with cuLaunchGridAsync on the NULL stream.
 *
 * Parameters:
 *   kernel  name of the kernel to run
 *   grid    grid dimensions; grid.z must be 1 (error() is called otherwise)
 *   block   block dimensions
 *   shmem   shared memory size passed to cuFuncSetSharedSize
 *   flags   not referenced in this function body
 *   ptrs    ptrs[i] is the argument value pointer for types[i]
 *   types   argument type tags, terminated by SWAN_END
 *
 * Behaviour, as visible in the body:
 *   - returns immediately, launching nothing, if any grid or block dimension
 *     is 0;
 *   - calls try_init(), then searches state.func_names for `kernel`; on a
 *     miss it asks every module in state.mods via cuModuleGetFunction and
 *     appends the first handle found (plus a copy of the name) to
 *     state.funcs / state.func_names;
 *   - calls error() if no kernel was found or if grid.z != 1;
 *   - walks types[] until SWAN_END; value types are handled by the DEBLOCK
 *     macro (defined outside this block), and SWAN_PTR is written with
 *     cuParamSetv at an offset aligned with ALIGN_UP, using sizeof(void*)
 *     when CUDA >= 3.2 and sizeof(int) otherwise; an unknown tag is printed
 *     and passed to error();
 *   - sets the parameter size, block shape and shared size, selects
 *     CU_FUNC_CACHE_PREFER_SHARED when CUDA >= 3.1, and launches;
 *   - on a launch failure prints a message to stderr, asserts, and exits
 *     with status -99.
 *
 * NOTE(review): the realloc/malloc results in the caching path and the
 * cuParamSetv result in the SWAN_PTR case are not checked.
 * NOTE(review): the code after the error() calls assumes error() does not
 * return -- confirm against its definition.
 */
void swanRunKernelAsync( const char *kernel, block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types ) { // find the kernel if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; } // suppress launch of kernel if any of the launch dims are 0 CUfunction f = NULL; int i; int offset = 0; CUresult err; int type; int idx=0; try_init(); for( i=0; i < state.num_funcs; i++ ) { if( !strcmp( state.func_names[i], kernel ) ) { f = state.funcs[i]; break; } } if( f == NULL ) { for( i=0; i < state.num_mods; i++ ) { cuModuleGetFunction( &f, state.mods[i], kernel ); if( f!= NULL ) { // found a kernel. store it for future use int j = state.num_funcs; state.num_funcs++; state.funcs = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * state.num_funcs ); state.funcs[j] = f; state.func_names = (char**) realloc( state.func_names, sizeof(char*) * state.num_funcs ); state.func_names[j] = (char*) malloc( strlen(kernel) + 1 ); strcpy( state.func_names[j], kernel ); break; } } } if( f== NULL ) { fprintf(stderr, "Error running kernel [%s] : \n", kernel ); error( "No kernel found" ); } if( grid.z != 1 ) { printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z ); error( "grid.z needs to be 1" ); } //printf("Running kernel [%s]\n", kernel ); type = types[idx]; while( type != SWAN_END ) { void *ptr = ptrs[idx]; switch( type ) { // DEBLOCK( SWAN_uchar, uchar, 1 ); DEBLOCK( SWAN_uchar2, uchar2, 2 ); DEBLOCK( SWAN_uchar3, uchar3, 1 ); DEBLOCK( SWAN_uchar4, uchar4, 4 ); DEBLOCK( SWAN_char , int, 1 ); // DEBLOCK( SWAN_char1 , char1, 1 ); DEBLOCK( SWAN_char2 , char2, 2 ); DEBLOCK( SWAN_char3 , char3, 1 ); DEBLOCK( SWAN_char4 , char4, 4 ); DEBLOCK( SWAN_int, int, 4 ); // DEBLOCK( SWAN_int1, int1, 4 ); DEBLOCK( SWAN_int2, int2, 8 ); DEBLOCK( SWAN_int3, int3, 4 ); DEBLOCK( SWAN_int4, int4, 16 ); // DEBLOCK( SWAN_float, double, 4 ); // DEBLOCK( SWAN_float1, float1, 4 ); 
DEBLOCK( SWAN_float2, float2, 8 ); DEBLOCK( SWAN_float3, float3, 4 ); DEBLOCK( SWAN_float4, float4, 16 ); DEBLOCK( SWAN_uint, uint, 4 ); DEBLOCK( SWAN_uint2, uint2, 8 ); DEBLOCK( SWAN_uint3, uint3, 4 ); DEBLOCK( SWAN_uint4, uint4, 16 ); DEBLOCK( SWAN_float, float, 4 ); //#define DEBLOCK(swan_type,type,OFFSET) #if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4 case SWAN_PTR: { //printf("PTR as NATIVE\n"); ALIGN_UP( offset, (sizeof(void*))); cuParamSetv( f, offset, ptr, sizeof(void*) ); offset += sizeof(void*); } break; #else case SWAN_PTR: { //printf("PTR as INT\n"); ALIGN_UP( offset, (sizeof(int))); cuParamSetv( f, offset, ptr, sizeof(int) ); offset += sizeof(int); } break; #endif default: printf("%d\n", type ); error("Parameter type not handled\n"); } idx++; type = types[idx]; } //printf("Launching kernel [%s] [%X] with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z ); //printf(" TOTAL OFFSET %d\n", offset ); CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) ); CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) ); CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) ); #if (CUDA_MAJOR ==3 && CUDA_MINOR >=1 ) || CUDA_MAJOR>=4 cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd #endif err = cuLaunchGridAsync( f, grid.x, grid.y, NULL ) ; //state.stream ) ; if( err != CUDA_SUCCESS ) { fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z ); assert(0); exit(-99); } //printf("Kernel completed\n" ); }
JNIEXPORT void JNICALL Java_org_trifort_rootbeer_runtime_CUDAContext_cudaRun (JNIEnv *env, jobject this_ref, jint device_index, jbyteArray cubin_file, jint cubin_length, jint block_shape_x, jint grid_shape_x, jint num_threads, jobject object_mem, jobject handles_mem, jobject exceptions_mem, jobject class_mem) { CUresult status; CUdevice device; CUcontext context; CUmodule module; CUfunction function; void * fatcubin; int offset; int info_space_size; CUdeviceptr gpu_info_space; CUdeviceptr gpu_object_mem; CUdeviceptr gpu_handles_mem; CUdeviceptr gpu_exceptions_mem; CUdeviceptr gpu_class_mem; CUdeviceptr gpu_heap_end; CUdeviceptr gpu_buffer_size; void * cpu_object_mem; void * cpu_handles_mem; void * cpu_exceptions_mem; void * cpu_class_mem; jlong cpu_object_mem_size; jlong cpu_handles_mem_size; jlong cpu_exceptions_mem_size; jlong cpu_class_mem_size; jlong cpu_heap_end; jclass cuda_memory_class; jmethodID get_address_method; jmethodID get_size_method; jmethodID get_heap_end_method; jlong * info_space; //---------------------------------------------------------------------------- //init device and function //---------------------------------------------------------------------------- status = cuDeviceGet(&device, device_index); CHECK_STATUS(env, "Error in cuDeviceGet", status, device) status = cuCtxCreate(&context, CU_CTX_MAP_HOST, device); CHECK_STATUS(env,"Error in cuCtxCreate", status, device) fatcubin = malloc(cubin_length); (*env)->GetByteArrayRegion(env, cubin_file, 0, cubin_length, fatcubin); status = cuModuleLoadFatBinary(&module, fatcubin); CHECK_STATUS(env, "Error in cuModuleLoad", status, device) free(fatcubin); status = cuModuleGetFunction(&function, module, "_Z5entryPcS_PiPxS1_S0_S0_i"); CHECK_STATUS(env, "Error in cuModuleGetFunction", status, device) //---------------------------------------------------------------------------- //get handles from java //---------------------------------------------------------------------------- cuda_memory_class = 
(*env)->FindClass(env, "org/trifort/rootbeer/runtime/FixedMemory"); get_address_method = (*env)->GetMethodID(env, cuda_memory_class, "getAddress", "()J"); get_size_method = (*env)->GetMethodID(env, cuda_memory_class, "getSize", "()J"); get_heap_end_method = (*env)->GetMethodID(env, cuda_memory_class, "getHeapEndPtr", "()J"); cpu_object_mem = (void *) (*env)->CallLongMethod(env, object_mem, get_address_method); cpu_object_mem_size = (*env)->CallLongMethod(env, object_mem, get_size_method); cpu_heap_end = (*env)->CallLongMethod(env, object_mem, get_heap_end_method); cpu_handles_mem = (void *) (*env)->CallLongMethod(env, handles_mem, get_address_method); cpu_handles_mem_size = (*env)->CallLongMethod(env, handles_mem, get_size_method); cpu_exceptions_mem = (void *) (*env)->CallLongMethod(env, exceptions_mem, get_address_method); cpu_exceptions_mem_size = (*env)->CallLongMethod(env, exceptions_mem, get_size_method); cpu_class_mem = (void *) (*env)->CallLongMethod(env, class_mem, get_address_method); cpu_class_mem_size = (*env)->CallLongMethod(env, class_mem, get_size_method); info_space_size = 1024; info_space = (jlong *) malloc(info_space_size); info_space[1] = (*env)->CallLongMethod(env, object_mem, get_heap_end_method); //---------------------------------------------------------------------------- //allocate mem //---------------------------------------------------------------------------- status = cuMemAlloc(&gpu_info_space, info_space_size); CHECK_STATUS(env, "Error in cuMemAlloc: gpu_info_mem", status, device) status = cuMemAlloc(&gpu_object_mem, cpu_object_mem_size); CHECK_STATUS(env, "Error in cuMemAlloc: gpu_object_mem", status, device) status = cuMemAlloc(&gpu_handles_mem, cpu_handles_mem_size); CHECK_STATUS(env, "Error in cuMemAlloc: gpu_handles_mem", status, device) status = cuMemAlloc(&gpu_exceptions_mem, cpu_exceptions_mem_size); CHECK_STATUS(env, "Error in cuMemAlloc: gpu_exceptions_mem", status, device) status = cuMemAlloc(&gpu_class_mem, 
cpu_class_mem_size); CHECK_STATUS(env, "Error in cuMemAlloc: gpu_class_mem", status, device) status = cuMemAlloc(&gpu_heap_end, 8); CHECK_STATUS(env, "Error in cuMemAlloc: gpu_heap_end", status, device) status = cuMemAlloc(&gpu_buffer_size, 8); CHECK_STATUS(env, "Error in cuMemAlloc: gpu_buffer_size", status, device) //---------------------------------------------------------------------------- //set function parameters //---------------------------------------------------------------------------- status = cuParamSetSize(function, (7 * sizeof(CUdeviceptr) + sizeof(int))); CHECK_STATUS(env, "Error in cuParamSetSize", status, device) offset = 0; status = cuParamSetv(function, offset, (void *) &gpu_info_space, sizeof(CUdeviceptr)); CHECK_STATUS(env, "Error in cuParamSetv gpu_info_space", status, device) offset += sizeof(CUdeviceptr); status = cuParamSetv(function, offset, (void *) &gpu_object_mem, sizeof(CUdeviceptr)); CHECK_STATUS(env, "Error in cuParamSetv: gpu_object_mem", status, device) offset += sizeof(CUdeviceptr); status = cuParamSetv(function, offset, (void *) &gpu_handles_mem, sizeof(CUdeviceptr)); CHECK_STATUS(env, "Error in cuParamSetv: gpu_handles_mem %", status, device) offset += sizeof(CUdeviceptr); status = cuParamSetv(function, offset, (void *) &gpu_heap_end, sizeof(CUdeviceptr)); CHECK_STATUS(env, "Error in cuParamSetv: gpu_heap_end", status, device) offset += sizeof(CUdeviceptr); status = cuParamSetv(function, offset, (void *) &gpu_buffer_size, sizeof(CUdeviceptr)); CHECK_STATUS(env, "Error in cuParamSetv: gpu_buffer_size", status, device) offset += sizeof(CUdeviceptr); status = cuParamSetv(function, offset, (void *) &gpu_exceptions_mem, sizeof(CUdeviceptr)); CHECK_STATUS(env, "Error in cuParamSetv: gpu_exceptions_mem", status, device) offset += sizeof(CUdeviceptr); status = cuParamSetv(function, offset, (void *) &gpu_class_mem, sizeof(CUdeviceptr)); CHECK_STATUS(env, "Error in cuParamSetv: gpu_class_mem", status, device) offset += 
sizeof(CUdeviceptr); status = cuParamSeti(function, offset, num_threads); CHECK_STATUS(env, "Error in cuParamSetv: num_threads", status, device) offset += sizeof(int); //---------------------------------------------------------------------------- //copy data //---------------------------------------------------------------------------- status = cuMemcpyHtoD(gpu_info_space, info_space, info_space_size); CHECK_STATUS(env, "Error in cuMemcpyHtoD: info_space", status, device) status = cuMemcpyHtoD(gpu_object_mem, cpu_object_mem, cpu_object_mem_size); CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_object_mem", status, device) status = cuMemcpyHtoD(gpu_handles_mem, cpu_handles_mem, cpu_handles_mem_size); CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_handles_mem", status, device) status = cuMemcpyHtoD(gpu_class_mem, cpu_class_mem, cpu_class_mem_size); CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_class_mem", status, device) status = cuMemcpyHtoD(gpu_heap_end, &cpu_heap_end, sizeof(jlong)); CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_heap_end", status, device) status = cuMemcpyHtoD(gpu_buffer_size, &cpu_object_mem_size, sizeof(jlong)); CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_buffer_size", status, device) status = cuMemcpyHtoD(gpu_exceptions_mem, cpu_exceptions_mem, cpu_exceptions_mem_size); CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_exceptions_mem", status, device) //---------------------------------------------------------------------------- //launch //---------------------------------------------------------------------------- status = cuFuncSetBlockShape(function, block_shape_x, 1, 1); CHECK_STATUS(env, "Error in cuFuncSetBlockShape", status, device); status = cuLaunchGrid(function, grid_shape_x, 1); CHECK_STATUS(env, "Error in cuLaunchGrid", status, device) status = cuCtxSynchronize(); CHECK_STATUS(env, "Error in cuCtxSynchronize", status, device) //---------------------------------------------------------------------------- //copy data back 
//---------------------------------------------------------------------------- status = cuMemcpyDtoH(info_space, gpu_info_space, info_space_size); CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_info_space", status, device) cpu_heap_end = info_space[1]; status = cuMemcpyDtoH(cpu_object_mem, gpu_object_mem, cpu_heap_end); CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_object_mem", status, device) status = cuMemcpyDtoH(cpu_exceptions_mem, gpu_exceptions_mem, cpu_exceptions_mem_size); CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_exceptions_mem", status, device) //---------------------------------------------------------------------------- //free resources //---------------------------------------------------------------------------- free(info_space); cuMemFree(gpu_info_space); cuMemFree(gpu_object_mem); cuMemFree(gpu_handles_mem); cuMemFree(gpu_exceptions_mem); cuMemFree(gpu_class_mem); cuMemFree(gpu_heap_end); cuMemFree(gpu_buffer_size); cuCtxDestroy(context); }
//host driver //void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) { void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){ //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); unsigned int maxthreads = 65000; // Set threads per block here. unsigned int blocksdim1d = 256; dim3 threads(blocksdim1d, 1, 1); int nstreams = iDivUp(N, maxthreads*blocksdim1d); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * blocksdim1d; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * blocksdim1d; else size = maxthreads * blocksdim1d; int gridx = iDivUp(size, blocksdim1d); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.y))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); ALIGN_UP(poffset, __alignof(imx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imx); ALIGN_UP(poffset, __alignof(imy)); if (CUDA_SUCCESS != 
cuParamSeti(drvfun, poffset, imy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imy); ALIGN_UP(poffset, __alignof(outx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outx); ALIGN_UP(poffset, __alignof(outy)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outy); ALIGN_UP(poffset, __alignof(poolx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(poolx); ALIGN_UP(poffset, __alignof(pooly)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(pooly); // if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { // mexErrMsgTxt("Error in cuParamSeti"); // } // poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
// Loads the kernel `test_name` from `cuModule`, runs it with a single thread
// and reports whether every sub-test it recorded succeeded.
//
// The kernel receives two device pointers: a result array of `max` bools
// (zero-initialised here) and an int counter (zero-initialised here). After
// the launch the first h_N entries of the result array are inspected; any
// entry equal to 0 is reported as a failed sub-test.
//
// Driver-API return codes are passed to test(); anything thrown inside the
// body is reported as "<test_name> crashed."
void load_and_test(CUmodule cuModule, char * test_name)
{
    const int max = 1000;
    bool * h_R = NULL;

    try
    {
        CUfunction proc;
        test(cuModuleGetFunction(&proc, cuModule, test_name), "cuModuleGetFunction");

        h_R = (bool*)malloc(max * sizeof(bool));
        if (h_R == NULL)
        {
            std::cout << test_name << " failed: out of host memory.\n";
            return;
        }
        memset(h_R, 0, max * sizeof(bool));

        CUdeviceptr d_R;
        test(cuMemAlloc(&d_R, max * sizeof(bool)), "cuMemAlloc");
        test(cuMemcpyHtoD(d_R, h_R, max * sizeof(bool)), "cuMemcpyHtoD");

        CUdeviceptr d_N;
        int h_N = 0;
        test(cuMemAlloc(&d_N, sizeof(int)), "cuMemAlloc");
        test(cuMemcpyHtoD(d_N, &h_N, sizeof(int)), "cuMemcpyHtoD");

        // Kernel arguments: (result array, result counter).
        int offset = 0;
        void* ptr;

        ptr = (void*)(size_t)d_R;
        ALIGN_UP(offset, __alignof(ptr));
        test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
        offset += sizeof(ptr);

        ptr = (void*)(size_t)d_N;
        ALIGN_UP(offset, __alignof(ptr));
        test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
        offset += sizeof(ptr);

        test(cuParamSetSize(proc, offset), "cuParamSetSize");

        int threadsPerBlock = 1;
        int blocksPerGrid = 1;

        test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape");
        test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid");

        test(cuMemcpyDtoH(h_R, d_R, max * sizeof(bool)), "cuMemcpyDtoH");
        test(cuMemcpyDtoH(&h_N, d_N, sizeof(int)), "cuMemcpyDtoH");
        test(cuMemFree(d_R), "cuMemFree");
        test(cuMemFree(d_N), "cuMemFree");

        // h_N is written by the device; never read past the host buffer.
        if (h_N > max)
        {
            std::cout << "\nKernel reported " << h_N << " results; only the first "
                      << max << " are checked.\n";
            h_N = max;
        }

        bool failed = false;
        for (int i = 0; i < h_N; ++i)
        {
            if (h_R[i] == 0)
            {
                failed = true;
                std::cout << "\nTest " << i << " failed.\n";
                std::cout.flush();
            }
        }

        if (! failed)
            std::cout << test_name << " passed.\n";
        else
        {
            std::cout << test_name << " failed.\n";
        }
    }
    catch (...)
    {
        // Release the host buffer first: test() may not return.
        free(h_R);
        h_R = NULL;

        std::string s = test_name;
        s = s.append(" crashed.\n");
        test(1, s.c_str());
    }

    free(h_R);
}
// Launches the NV12 -> ARGB conversion kernel `fpFunc` on stream `streamID`.
//
// Kernel arguments, in order: source surface pointer, source pitch,
// destination surface pointer, destination pitch, width, height.
//
// Each thread outputs 2 pixels, so the grid covers width / 2 columns using
// 32x16 thread blocks (both grid dimensions are rounded up).
//
// Returns the status of the launch call; a failure is also logged to stderr.
CUresult cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                 CUdeviceptr d_dstARGB, size_t nDestPitch,
                                 uint32 width, uint32 height,
                                 CUfunction fpFunc, CUstream streamID)
{
    CUresult status;

    // Each thread will output 2 pixels at a time. The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if CUDA_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    cutilDrvSafeCall(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers

    // device pointer for Source Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_srcNV12, sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_dstARGB, sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nDestPitch, sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    cutilDrvSafeCall(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status) {
        // CUfunction is a pointer-sized handle: print it with %p instead of
        // truncating it to unsigned int.
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n",
                (void *)fpFunc, status);
    }

    return status;
}
int main(void) { // Initialize if (cuInit(0) != CUDA_SUCCESS) exit (0); // Get number of devices supporting CUDA int deviceCount = 0; cuDeviceGetCount(&deviceCount); if (deviceCount == 0) { printf("There is no device supporting CUDA.\n"); exit (0); } // Get handle for device 0 CUdevice cuDevice = 0; cuDeviceGet(&cuDevice, 0); // Create context CUcontext cuContext; cuCtxCreate(&cuContext, 0, cuDevice); // Create module from binary file CUmodule cuModule; cuModuleLoad(&cuModule, “VecAdd.ptx”); // Get function handle from module CUfunction vecAdd; cuModuleGetFunction(&vecAdd, cuModule, "VecAdd"); // Allocate vectors in device memory size_t size = N * sizeof(float); CUdeviceptr d_A; cuMemAlloc(&d_A, size); CUdeviceptr d_B; cuMemAlloc(&d_B, size); CUdeviceptr d_C; cuMemAlloc(&d_C, size); // Copy vectors from host memory to device memory // h_A and h_B are input vectors stored in host memory cuMemcpyHtoD(d_A, h_A, size); cuMemcpyHtoD(d_B, h_B, size); // Invoke kernel #define ALIGN_UP(offset, alignment) \ (offset) = ((offset) + (alignment) – 1) & ~((alignment) – 1) int offset = 0; ALIGN_UP(offset, __alignof(d_A)); cuParamSetv(vecAdd, offset, &d_A, sizeof(d_A)); offset += sizeof(d_A); ALIGN_UP(offset, __alignof(d_B)); cuParamSetv(vecAdd, offset, &d_B, sizeof(d_B)); offset += sizeof(d_B); ALIGN_UP(offset, __alignof(d_C)); cuParamSetv(vecAdd, offset, &d_C, sizeof(d_C)); offset += sizeof(d_C); cuParamSetSize(VecAdd, offset); int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock – 1) / threadsPerBlock; cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1); cuLaunchGrid(VecAdd, blocksPerGrid, 1); // Copy result from device memory to host memory // h_C contains the result in host memory cuMemcpyDtoH(h_C, d_C, size); // Free device memory cuMemFree(d_A); cuMemFree(d_B); cuMemFree(d_C); return (0); }
/* * parameter setting * provide kernel with needed parameter when it be launched */ void parameter_set(void){ res = cuParamSeti(function, 0, x_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(x) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 4, x_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(x) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 8, v_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(v) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 12, v_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(v) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSetv(function, 16, &a, 8); if(res != CUDA_SUCCESS){ printf("cuParamSetv(a) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 24, error_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 28, error_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 32, s_time_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 36, s_time_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSetSize(function, 40); if(res != CUDA_SUCCESS){ printf("cuParaMSetSize() failed: res = %s\n", conv(res)); exit(1); } }
void Function::setParameter(int offset, void *data, unsigned int len) const { detail::error_check(cuParamSetv(impl->func, offset, data, len), "Can't set Cuda function parameter"); }
/* Recovers `recover_last` lost data buffers on the GPU.
 *
 * buffers       host memory holding the buffers; recovered buffers are
 *               written after the first n buffers
 * buf_size      size of one buffer in bytes
 * buf_ids       buf_ids[0..n-1] identify the buffers present;
 *               buf_ids[n..n+recover_last-1] identify the buffers to rebuild
 * recover_last  number of buffers to rebuild
 * c             context supplying n, m and the GPU state
 *
 * Returns GIB_SUC on success and GIB_ERR if a parity buffer is requested.
 * When GIB_USE_MMAP is off and buf_size exceeds gib_buf_size, the work is
 * delegated to gib_cpu_recover and its return code is returned.
 *
 * The context pushed on entry is popped on every return path.
 */
int gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
		  gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if !GIB_USE_MMAP
  if (buf_size > gib_buf_size) {
    int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int i, j;
  int n = c->n;
  int m = c->m;
  unsigned char A[128*128], inv[128*128], modA[128*128];

  for (i = n; i < n+recover_last; i++)
    if (buf_ids[i] >= n) {
      fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
      /* Do not leave the pushed context behind on the error path. */
      ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
      return GIB_ERR;
    }

  gib_galois_gen_A(A, m+n, n);

  /* Modify the matrix to have the failed drives reflected */
  for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = A[buf_ids[i]*n+j];

  gib_galois_gaussian_elim(modA, inv, n, n);

  /* Copy row buf_ids[i] into row i */
  for (i = n; i < n+recover_last; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = inv[buf_ids[i]*n+j];

  /* One block of 128 threads covers sizeof(int)*128 bytes of each buffer. */
  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  /* Upload the recovery rows into the module's F_d global. */
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif

  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block, 1, 1));

  /* Kernel arguments: (buffers pointer, buf_size, recover_last). */
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)gpu_c->buffers;
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size, sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last, sizeof(recover_last)));
  offset += sizeof(recover_last);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));

#if !GIB_USE_MMAP
  /* Copy the recovered buffers (stored after the n data buffers) back. */
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
  /* Checked here the same way gib_generate checks it. */
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif

  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
/* Generates the m parity buffers for the n data buffers in `buffers` on the
 * GPU. Each buffer is buf_size bytes; the parity buffers are written directly
 * after the n data buffers.
 *
 * Returns GIB_SUC, or the return code of gib_generate_nc when GIB_USE_MMAP is
 * off and buf_size exceeds gib_buf_size.
 */
int gib_generate ( void *buffers, int buf_size, gib_context c ) {
  gpu_context gpu = (gpu_context) c->acc_context;

  ERROR_CHECK_FAIL(cuCtxPushCurrent(gpu->pCtx));

#if !GIB_USE_MMAP
  /* Buffers larger than the preallocated GPU region cannot be processed in
   * one pass; hand them to the noncontiguous routine instead.
   */
  if (buf_size > gib_buf_size) {
    int nc_result = gib_generate_nc(buffers, buf_size, buf_size, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&gpu->pCtx));
    return nc_result;
  }
#endif

  /* Launch geometry: each 128-thread block covers sizeof(int)*128 bytes. */
  int threads_per_block = 128;
  int bytes_per_block = sizeof(int)*threads_per_block;
  int block_count = (buf_size + bytes_per_block - 1)/bytes_per_block;

  /* Build the coding matrix and upload it into the module's F_d global. */
  unsigned char coding[256*256];
  gib_galois_gen_F(coding, c->m, c->n);
  CUdeviceptr coding_dev;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&coding_dev, NULL, gpu->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(coding_dev, coding, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  /* Stage the data buffers in device memory. */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu->buffers, buffers, (c->n)*buf_size));
#endif

  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu->checksum, threads_per_block, 1, 1));

  /* Kernel arguments: (buffers pointer, buf_size). */
  int arg_bytes = 0;
  void *buffers_arg;
#if GIB_USE_MMAP
  CUdeviceptr mapped_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&mapped_buffers, buffers, 0));
  buffers_arg = (void *)mapped_buffers;
#else
  buffers_arg = (void *)(gpu->buffers);
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu->checksum, arg_bytes, &buffers_arg, sizeof(buffers_arg)));
  arg_bytes += sizeof(buffers_arg);
  ERROR_CHECK_FAIL(cuParamSetv(gpu->checksum, arg_bytes, &buf_size, sizeof(buf_size)));
  arg_bytes += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu->checksum, arg_bytes));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu->checksum, block_count, 1));

#if !GIB_USE_MMAP
  /* Fetch the m parity buffers that follow the n data buffers. */
  CUdeviceptr parity_dev = gpu->buffers + c->n*buf_size;
  void *parity_host = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(parity_host, parity_dev, (c->m)*buf_size));
#else
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif

  ERROR_CHECK_FAIL(cuCtxPopCurrent(&gpu->pCtx));
  return GIB_SUC;
}
int main(int argc, char *argv[]) { argc--; argv++; // Instruction-level test of PTX assembly language and emulator. // This test should work natively and under emulation. Many of the // instructions tested here stress many poorly documented features // of the PTX assembly language. If the emulator passes these // tests, then it can surely pass code that is generated by the // nvcc compiler. test(cuInit(0), "cuInit"); int deviceCount = 0; test(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount"); int device = 0; if (argc) device = atoi(*argv); CUdevice cuDevice = 0; test(cuDeviceGet(&cuDevice, device), "cuDeviceGet"); CUcontext cuContext; int xxx = cuCtxCreate(&cuContext, 0, cuDevice); CUmodule cuModule; test(cuModuleLoad(&cuModule, "inst.ptx"), "cuModuleLoad"); // Do basic test. No sense continuing if we cannot complete this // test. try { CUfunction proc; test(cuModuleGetFunction(&proc, cuModule, "InstBasic"), "cuModuleGetFunction"); bool * h_R = (bool*)malloc(sizeof(bool)); memset(h_R, 0, sizeof(bool)); CUdeviceptr d_R; test(cuMemAlloc(&d_R, sizeof(bool)), "cuMemAlloc"); test(cuMemcpyHtoD(d_R, h_R, sizeof(bool)), "cuMemcpyHtoD"); int offset = 0; void* ptr; ptr = (void*)(size_t)d_R; ALIGN_UP(offset, __alignof(ptr)); test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv"); offset += sizeof(ptr); test(cuParamSetSize(proc, offset), "cuParamSetSize"); int threadsPerBlock = 1; int blocksPerGrid = 1; test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape"); test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid"); test(cuMemcpyDtoH(h_R, d_R, sizeof(bool)), "cuMemcpyDtoH"); test(cuMemFree(d_R), "cuMemFree"); if (h_R[0] == 1) std::cout << "Basic test passed.\n"; else { std::cout << "Basic test failed.\n"; exit(1); } } catch (...) { test(1, "test crashed."); } // Do LD, ST, MOV test. load_and_test(cuModule, "InstLSMC"); // Do ADD, SUB test. load_and_test(cuModule, "InstAddSub"); return 0; }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// void runTest(int argc, char** argv) { CUcontext cuContext; // initialize CUDA CUfunction pk = NULL; const char cubin_name [] = "pass_kernel.cubin"; const char kernel_name [] = "pass_kernel"; CU_SAFE_CALL(initCuda(cuContext, argv[0], &pk, argc, argv, cubin_name, kernel_name)); printf("initCuda-returned CUfunction:\n"); // cuParamSetx, x=i f v // http://visionexperts.blogspot.com/2010/07/cuda-parameter-alignment.html - check alignment #define ALIGN_UP(offset, alignment) \ (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) size_t offset = 0; // input integers // CU paramset i. for(int i = 0 ; i < NUM_ARG ; i++) { int align = __alignof(int); ALIGN_UP(offset, align); cuParamSeti(pk, offset, i); printf ("offset %d = %d\n", i, offset); offset += sizeof(int); } // return array for updated inputs int size_int = sizeof(int); int size_array = size_int * NUM_ARG; CUdeviceptr d_return_values; cuMemAlloc (&d_return_values, size_array); void* ptr = (void*)(size_t)d_return_values; int align = __alignof(ptr); ALIGN_UP(offset, align); cuParamSetv(pk, offset, &ptr, sizeof(ptr)); printf("return values offset:%d\n", offset); offset += sizeof(ptr); CUdeviceptr d_return_N; cuMemAlloc(&d_return_N, size_int); void* ptrN = (void*)(size_t)d_return_N; int alignN = __alignof(ptrN); ALIGN_UP(offset, alignN); cuParamSetv(pk, offset, &ptrN, sizeof(ptr)); printf("return int offset:%d\n", offset); offset += sizeof(ptrN); // Calling kernel int BLOCK_SIZE_X = NUM_ARG; int BLOCK_SIZE_Y = 1; int BLOCK_SIZE_Z = 1; int GRID_SIZE = 1; cutilDrvSafeCallNoSync(cuFuncSetBlockShape(pk, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z)); printf("paramsetsize:%d\n", offset); CU_SAFE_CALL(cuParamSetSize(pk, offset)); CU_SAFE_CALL(cuLaunchGrid(pk, GRID_SIZE, GRID_SIZE)); int* h_return_values = (int*)malloc(NUM_ARG * 
sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_values, d_return_values, size_array)); CU_SAFE_CALL(cuMemFree(d_return_values)); for(int i=0;i<NUM_ARG;i++) printf("%dth value = %d\n", i, h_return_values[i]); free(h_return_values); int* h_return_N = (int*)malloc(sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_N, d_return_N, size_int)); CU_SAFE_CALL(cuMemFree(d_return_N)); printf("%d sizeof array\n", *h_return_N); if(cuContext !=NULL) cuCtxDetach(cuContext); }
// Host code int main() { int N = 3; size_t size = N * sizeof(float); float* h_A = (float*)malloc(size); float* h_B = (float*)malloc(size); float* h_C = (float*)malloc(size); // Set up vectors. for (int i = 0; i < N; ++i) { h_A[i] = i * 1.0; h_B[i] = i * 1.0 + 1; h_C[i] = 0; printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]); } // Initialize if (cuInit(0) != CUDA_SUCCESS) exit (0); // Get number of devices supporting CUDA int deviceCount = 0; cuDeviceGetCount(&deviceCount); if (deviceCount == 0) { printf("There is no device supporting CUDA.\n"); exit (0); } // Get handle for device 0 CUdevice cuDevice = 0; CUresult r1 = cuDeviceGet(&cuDevice, 0); // Create context CUcontext cuContext; cuCtxCreate(&cuContext, 0, cuDevice); // Create module from binary file CUmodule cuModule; CUresult r2 = cuModuleLoad(&cuModule, "VecAdd.ptx"); // Get function handle from module CUfunction vecAdd; CUresult r3 = cuModuleGetFunction(&vecAdd, cuModule, "VecAdd"); // Allocate vectors in device memory CUdeviceptr d_A; CUresult r4 = cuMemAlloc(&d_A, size); CUdeviceptr d_B; CUresult r5 = cuMemAlloc(&d_B, size); CUdeviceptr d_C; CUresult r6 = cuMemAlloc(&d_C, size); // Copy vectors from host memory to device memory // h_A and h_B are input vectors stored in host memory CUresult r7 = cuMemcpyHtoD(d_A, h_A, size); CUresult r8 = cuMemcpyHtoD(d_B, h_B, size); // Invoke kernel #define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) int offset = 0; void* ptr; ptr = (void*)(size_t)d_A; ALIGN_UP(offset, __alignof(ptr)); CUresult r9 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); ptr = (void*)(size_t)d_B; ALIGN_UP(offset, __alignof(ptr)); CUresult r10 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); ptr = (void*)(size_t)d_C; ALIGN_UP(offset, __alignof(ptr)); CUresult r11 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); ptr = (void*)(int)N; ALIGN_UP(offset, __alignof(ptr)); 
CUresult r11a = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr)); offset += sizeof(ptr); CUresult r12 = cuParamSetSize(vecAdd, offset); int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; CUresult r13 = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1); CUresult r14 = cuLaunchGrid(vecAdd, blocksPerGrid, 1); // Copy result from device memory to host memory // h_C contains the result in host memory CUresult r15 = cuMemcpyDtoH(h_C, d_C, size); for (int i = 0; i < N; ++i) { printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]); } // Free device memory cuMemFree(d_A); cuMemFree(d_B); cuMemFree(d_C); }
void CUDARunner::FindBestConfiguration() { unsigned long lowb=16; unsigned long highb=128; unsigned long lowt=16; unsigned long hight=256; unsigned long bestb=16; unsigned long bestt=16; int offset=0; void *ptr=0; int64 besttime=std::numeric_limits<int64>::max(); if(m_requestedgrid>0 && m_requestedgrid<=65536) { lowb=m_requestedgrid; highb=m_requestedgrid; } if(m_requestedthreads>0 && m_requestedthreads<=65536) { lowt=m_requestedthreads; hight=m_requestedthreads; } for(int numb=lowb; numb<=highb; numb*=2) { for(int numt=lowt; numt<=hight; numt*=2) { if(AllocateResources(numb,numt)==true) { // clear out any existing error CUresult err=CUDA_SUCCESS; int64 st=GetTimeMillis(); for(int it=0; it<128*256*2 && err==CUDA_SUCCESS; it+=(numb*numt)) { cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in)); offset=0; int loops=64; int bits=5; ptr=(void *)(size_t)m_devin; ALIGN_UP(offset, __alignof(ptr)); cuParamSetv(m_function,offset,&ptr,sizeof(ptr)); offset+=sizeof(ptr); ptr=(void *)(size_t)m_devout; ALIGN_UP(offset, __alignof(ptr)); cuParamSetv(m_function,offset,&ptr,sizeof(ptr)); offset+=sizeof(ptr); ALIGN_UP(offset, __alignof(loops)); cuParamSeti(m_function,offset,loops); offset+=sizeof(loops); ALIGN_UP(offset, __alignof(bits)); cuParamSeti(m_function,offset,bits); offset+=sizeof(bits); cuParamSetSize(m_function,offset); err=cuFuncSetBlockShape(m_function,numt,1,1); if(err!=CUDA_SUCCESS) { printf("cuFuncSetBlockShape error %d\n",err); continue; } err=cuLaunchGrid(m_function,numb,1); if(err!=CUDA_SUCCESS) { printf("cuLaunchGrid error %d\n",err); continue; } cuMemcpyDtoH(m_out,m_devout,numt*numb*sizeof(cuda_out)); if(err!=CUDA_SUCCESS) { printf("CUDA error %d\n",err); } } int64 et=GetTimeMillis(); printf("Finding best configuration step end (%d,%d) %"PRI64d"ms prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime); if((et-st)<besttime && err==CUDA_SUCCESS) { bestb=numb; bestt=numt; besttime=et-st; } } } } m_numb=bestb; m_numt=bestt; AllocateResources(m_numb,m_numt); }
/************************************************* * HOST DRIVERS *************************************************/ void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) { unsigned int maxthreads = MAXTHREADS_STREAM; int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * BLOCK_DIM1D; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * BLOCK_DIM1D; else size = maxthreads * BLOCK_DIM1D; int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements // offset: used for streams ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
int main() { const int N = 9; int bytes = sizeof(int) * N; int *A_cpu = 0; CUdeviceptr A_gpu = 0; A_cpu = new int[N]; for (int i = 0; i < N; i++) { A_cpu[i] = -1; } CUresult result = cuInit(0); if (result != CUDA_SUCCESS) { report("cuInit() failed: " << result); return 1; } int driverVersion = 0; result = cuDriverGetVersion(&driverVersion); if (result != CUDA_SUCCESS) { report("cuDriverGetVersion() failed: " << result); } int count = 0; result = cuDeviceGetCount(&count); if (result != CUDA_SUCCESS) { report("cuDeviceGetCount() failed: " << result); return 1; } CUdevice device; result = cuDeviceGet(&device, 0); if (result != CUDA_SUCCESS) { report("cuDeviceGet() failed: " << result); return 1; } char devName[256] = {0}; result = cuDeviceGetName(devName, 255, device); if (result != CUDA_SUCCESS) { report("cuDeviceGetName() failed: " << result); return 1; } int major, minor; result = cuDeviceComputeCapability(&major, &minor, device); if (result != CUDA_SUCCESS) { report("cuDeviceComputeCapability() failed: " << result); return 1; } CUcontext ctx; CUmodule module; CUfunction function; result = cuCtxCreate(&ctx, 0, device); if (result != CUDA_SUCCESS) { report("cuCtxCreate() failed: " << result); return 1; } int pi = 0; result = cuDeviceGetAttribute(&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device); if (result != CUDA_SUCCESS) { report("cuDeviceGetAttribute() failed: " << result); } result = cuModuleLoad(&module, "ocelot/cuda/test/driver/generic.ptx"); if (result != CUDA_SUCCESS) { report("cuModuleLoad() failed: " << result); return 1; } result = cuModuleGetFunction(&function, module, "genericmemory"); if (result != CUDA_SUCCESS) { report("cuModuleGetFunction() failed: " << result); return 1; } result = cuMemAlloc(&A_gpu, bytes); if (result != CUDA_SUCCESS) { report("cuMemAlloc() failed: " << result); return 1; } result = cuMemcpyHtoD(A_gpu, A_cpu, bytes); if (result != CUDA_SUCCESS) { report("cuMemcpyHtoD() failed: " << result); return 1; } struct { int *A; } parameters; 
result = cuParamSetSize(function, sizeof(parameters)); if (result != CUDA_SUCCESS) { report("cuParamSetSize() failed: " << result); return 1; } parameters.A = reinterpret_cast<int *>(A_gpu); result = cuParamSetv(function, 0, ¶meters.A, sizeof(parameters.A)); if (result != CUDA_SUCCESS) { report("cuParamSetv() failed: " << result); return 1; } result = cuFuncSetBlockShape(function, 1, 1, 1); if (result != CUDA_SUCCESS) { report("cuFuncSetBlockShape() failed: " << result); return 1; } result = cuLaunchGrid(function, 1, 1); if (result != CUDA_SUCCESS) { report("cuLaunchGrid() failed: " << result); return 1; } result = cuMemcpyDtoH(A_cpu, A_gpu, bytes); if (result != CUDA_SUCCESS) { report("cuMemcpyDtoH() failed: " << result); return 1; } cuModuleUnload(module); cuCtxDestroy(ctx); int errors = 0; for (int i = 0; i < 9; i++) { if (i < 3 && !A_cpu[i] || i >= 3 && A_cpu[i]) { ++errors; std::cout << "%p" << i << " - " << A_cpu[i] << "\n"; } } delete [] A_cpu; std::cout << "Pass/Fail : " << (!errors ? "Pass" : "Fail") << std::endl; return 0; }