//host driver void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int imgSize, int numRegionsY, int shmemX, int shmem, int nrhs, hostdrv_pars_t *prhs) { //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); CUresult err = CUDA_SUCCESS; // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, shmem)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imgSize)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imgSize); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, numRegionsY)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(numRegionsY); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, grid.x, grid.y, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } }
/* * Initializaiton in order to use kernel program */ void init_cuda(void){ thread_num = (N <= 16) ? N : 16 ; block_num = N / (thread_num*thread_num); if(N % (thread_num*thread_num) != 0) block_num++; res = cuInit(0); if(res != CUDA_SUCCESS){ printf("cuInit failed: res = %s\n", conv(res)); exit(1); } res = cuDeviceGet(&dev, 0); if(res != CUDA_SUCCESS){ printf("cuDeviceGet failed: res = %s\n", conv(res)); exit(1); } res = cuCtxCreate(&ctx, 0, dev); if(res != CUDA_SUCCESS){ printf("cuCtxCreate failed: res = %s\n", conv(res)); exit(1); } res = cuModuleLoad(&module, "./cuda_main.cubin"); if(res != CUDA_SUCCESS){ printf("cuModuleLoad() failed: res = %s\n", conv(res)); exit(1); } res = cuModuleGetFunction(&function, module, "cuda_main"); if(res != CUDA_SUCCESS){ printf("cuModuleGetFunction() failed: res = %s\n", conv(res)); exit(1); } /* * preparation for launch kernel */ res = cuFuncSetSharedSize(function, 0x40); /* just random */ if(res != CUDA_SUCCESS){ printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res)); exit(1); } res = cuFuncSetBlockShape(function, thread_num, thread_num, 1); if(res != CUDA_SUCCESS){ printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res)); exit(1); } }
/*
 * R binding for cuFuncSetSharedSize().
 *
 * r_hfunc is resolved to a CUfunction through getRReference(); the first
 * element of the numeric vector r_bytes is converted to unsigned int and
 * used as the byte count. The CUresult of the driver call is handed back
 * to R through Renum_convert_CUresult().
 */
SEXP R_auto_cuFuncSetSharedSize(SEXP r_hfunc, SEXP r_bytes)
{
    CUfunction fn = (CUfunction) getRReference(r_hfunc);
    unsigned int nbytes = REAL(r_bytes)[0];
    CUresult status = cuFuncSetSharedSize(fn, nbytes);

    return Renum_convert_CUresult(status);
}
/*
 * Driver.
 *
 * Launches `drvfun` with one-dimensional blocks of BLOCK_DIM1D threads
 * (BLOCK_DIM1D is set in GPUkernel.hh); each thread block computes a linear
 * block of the target. The grid has ceil(n / BLOCK_DIM1D) blocks in x and the
 * shared-memory size is m * sizeof(float).
 *
 *   drvfun   kernel handle to configure and launch
 *   nrhs     number of entries in prhs
 *   prhs     caller-prepared arguments; entry p contributes psize bytes
 *            read from par
 *   n, m     appended as int arguments, in that order, after the prhs entries
 *
 * Arguments are packed back to back without alignment padding. Every failing
 * driver call is reported through mexErrMsgTxt.
 */
void hostGPUPdist(CUfunction drvfun, int nrhs, hostdrv_pars_t *prhs, int n, int m) {
  const int numBlocks = (n + BLOCK_DIM1D - 1) / BLOCK_DIM1D;
  CUresult status = CUDA_SUCCESS;

  /* Execution configuration. */
  status = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }
  if (cuFuncSetSharedSize(drvfun, m*sizeof(float)) != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }

  /* Caller-supplied arguments first. */
  int cursor = 0;
  for (int k = 0; k < nrhs; ++k) {
    hostdrv_pars_t *arg = prhs + k;
    if (cuParamSetv(drvfun, cursor, arg->par, arg->psize) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    cursor += arg->psize;
  }

  /* Then n followed by m. */
  const int dims[2] = { n, m };
  for (int d = 0; d < 2; ++d) {
    if (cuParamSeti(drvfun, cursor, dims[d]) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    cursor += sizeof(int);
  }

  if (cuParamSetSize(drvfun, cursor) != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  /* Asynchronous launch on stream 0. */
  status = cuLaunchGridAsync(drvfun, numBlocks, 1, 0);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error running kernel");
  }
}
/**
 * Invokes the kernel @f on a @gridDimX x @gridDimY x @gridDimZ grid of blocks.
 * Each block contains @blockDimX x @blockDimY x @blockDimZ threads.
 * @sharedMemBytes sets the amount of dynamic shared memory that will be
 * available to each thread block.
 *
 * cuLaunchKernel() can optionally be associated to a stream by passing a
 * non-zero hStream argument.
 *
 * Kernel parameters to @f can be specified in one of two ways:
 *
 * 1) Kernel parameters can be specified via kernelParams. If f has N
 * parameters, then kernelParams needs to be an array of N pointers. Each of
 * kernelParams[0] through kernelParams[N-1] must point to a region of memory
 * from which the actual kernel parameter will be copied. The number of kernel
 * parameters and their offsets and sizes do not need to be specified as that
 * information is retrieved directly from the kernel's image.
 *
 * 2) Kernel parameters can also be packaged by the application into a single
 * buffer that is passed in via the extra parameter. This places the burden on
 * the application of knowing each kernel parameter's size and alignment/
 * padding within the buffer. Here is an example of using the extra parameter
 * in this manner:
 *
 *     size_t argBufferSize;
 *     char argBuffer[256];
 *
 *     // populate argBuffer and argBufferSize
 *
 *     void *config[] = {
 *         CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
 *         CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
 *         CU_LAUNCH_PARAM_END
 *     };
 *     status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 *
 * The extra parameter exists to allow cuLaunchKernel to take additional less
 * commonly used arguments. extra specifies a list of names of extra settings
 * and their corresponding values. Each extra setting name is immediately
 * followed by the corresponding value. The list must be terminated with
 * either NULL or CU_LAUNCH_PARAM_END.
 *
 *     CU_LAUNCH_PARAM_END, which indicates the end of the extra array;
 *     CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next value in
 *     extra will be a pointer to a buffer containing all the kernel parameters
 *     for launching kernel f;
 *     CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next value in extra
 *     will be a pointer to a size_t containing the size of the buffer specified
 *     with CU_LAUNCH_PARAM_BUFFER_POINTER;
 *
 * The error CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters
 * are specified with both kernelParams and extra (i.e. both kernelParams and
 * extra are non-NULL).
 *
 * Calling cuLaunchKernel() sets persistent function state that is the same as
 * function state set through the following deprecated APIs:
 *
 * cuFuncSetBlockShape() cuFuncSetSharedSize() cuParamSetSize() cuParamSeti()
 * cuParamSetf() cuParamSetv()
 *
 * When the kernel @f is launched via cuLaunchKernel(), the previous block
 * shape, shared size and parameter info associated with @f is overwritten.
 *
 * Note that to use cuLaunchKernel(), the kernel @f must either have been
 * compiled with toolchain version 3.2 or later so that it will contain kernel
 * parameter information, or have no kernel parameters. If either of these
 * conditions is not met, then cuLaunchKernel() will return
 * CUDA_ERROR_INVALID_IMAGE.
 *
 * Notes on this implementation (it differs from the description above):
 *   - a non-zero hStream is rejected with CUDA_ERROR_INVALID_HANDLE;
 *   - a non-NULL extra is rejected with CUDA_ERROR_INVALID_HANDLE;
 *   - gridDimZ is accepted but not forwarded: the launch goes through
 *     cuLaunchGrid(), which only receives gridDimX and gridDimY;
 *   - CUDA_ERROR_INVALID_VALUE is returned when the kernel has parameters
 *     but kernelParams is NULL;
 *   - a failure while copying any individual parameter aborts the launch and
 *     is returned to the caller.
 *
 * Parameters:
 * f - Kernel to launch
 * gridDimX - Width of grid in blocks
 * gridDimY - Height of grid in blocks
 * gridDimZ - Depth of grid in blocks
 * blockDimX - X dimension of each thread block
 * blockDimY - Y dimension of each thread block
 * blockDimZ - Z dimension of each thread block
 * sharedMemBytes - Dynamic shared-memory size per thread block in bytes
 * hStream - Stream identifier
 * kernelParams - Array of pointers to kernel parameters
 * extra - Extra options
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_HANDLE,
 * CUDA_ERROR_INVALID_IMAGE, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_LAUNCH_FAILED,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, CUDA_ERROR_LAUNCH_TIMEOUT,
 * CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
 * CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
 */
CUresult cuLaunchKernel
(CUfunction f,
 unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
 unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
 unsigned int sharedMemBytes, CUstream hStream,
 void **kernelParams, void **extra)
{
	struct gdev_cuda_raw_func *rf;
	CUresult res;
	int i;

	if (hStream) {
		GDEV_PRINT("cuLaunchKernel: Stream is not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	if (extra) {
		GDEV_PRINT("cuLaunchKernel: Extra Parameters are not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	res = cuFuncSetSharedSize(f, sharedMemBytes);
	if (res != CUDA_SUCCESS)
		return res;

	res = cuFuncSetBlockShape(f, blockDimX, blockDimY, blockDimZ);
	if (res != CUDA_SUCCESS)
		return res;

	rf = &f->raw_func;

	/* The loop below indexes kernelParams, so it must exist whenever the
	 * kernel image declares at least one parameter. */
	if (rf->param_count > 0 && !kernelParams)
		return CUDA_ERROR_INVALID_VALUE;

	/* Copy every parameter to the offset recorded in the kernel image. */
	for (i = 0; i < rf->param_count; i++) {
		void *p = kernelParams[i];
		int offset = rf->param_info[i].offset;
		uint32_t size = rf->param_info[i].size;

		/* Never launch with a partially populated parameter block. */
		res = cuParamSetv(f, offset, p, size);
		if (res != CUDA_SUCCESS)
			return res;
	}

	res = cuParamSetSize(f, rf->param_size);
	if (res != CUDA_SUCCESS)
		return res;

	/* cuLaunchGrid() only takes a 2D grid; gridDimZ is not forwarded. */
	res = cuLaunchGrid(f, gridDimX, gridDimY);
	if (res != CUDA_SUCCESS)
		return res;

	return CUDA_SUCCESS;
}
/************************************************* * HOST DRIVERS *************************************************/ void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) { unsigned int maxthreads = MAXTHREADS_STREAM; int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * BLOCK_DIM1D; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * BLOCK_DIM1D; else size = maxthreads * BLOCK_DIM1D; int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements // offset: used for streams ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
//host driver //void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) { void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){ //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); unsigned int maxthreads = 65000; // Set threads per block here. unsigned int blocksdim1d = 256; dim3 threads(blocksdim1d, 1, 1); int nstreams = iDivUp(N, maxthreads*blocksdim1d); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * blocksdim1d; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * blocksdim1d; else size = maxthreads * blocksdim1d; int gridx = iDivUp(size, blocksdim1d); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.y))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); ALIGN_UP(poffset, __alignof(imx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imx); ALIGN_UP(poffset, __alignof(imy)); if (CUDA_SUCCESS != 
cuParamSeti(drvfun, poffset, imy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imy); ALIGN_UP(poffset, __alignof(outx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outx); ALIGN_UP(poffset, __alignof(outy)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outy); ALIGN_UP(poffset, __alignof(poolx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(poolx); ALIGN_UP(poffset, __alignof(pooly)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(pooly); // if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { // mexErrMsgTxt("Error in cuParamSeti"); // } // poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
int cuda_test_fmadd(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; float *a = (float *) malloc (n*n * sizeof(float)); float *b = (float *) malloc (n*n * sizeof(float)); float *c = (float *) malloc (n*n * sizeof(float)); int block_x, block_y, grid_x, grid_y; int offset; char fname[256]; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; float exec; /* initialize A[] & B[] */ for (i = 0; i < n; i++) { for(j = 0; j < n; j++) { idx = i * n + j; a[idx] = i + 0.1; b[idx] = i + 0.1; } } /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; printf("block = (%d, %d)\n", block_x, block_y); printf("grid = (%d, %d)\n", grid_x, grid_y); gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/fmadd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetSharedSize(function, 0x40); /* just random */ if (res != CUDA_SUCCESS) { printf("cuFuncSetSharedSize() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res 
!= CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } gettimeofday(&tv_h2d_start, NULL); /* upload a[] and b[] */ res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_h2d_end, NULL); /* set kernel parameters */ offset = 0; res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(a_dev); res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(b_dev); res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(c_dev); res = cuParamSetv(function, offset, &n, sizeof(n)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(n); res = cuParamSetSize(function, offset); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); /* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", 
(unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_d2h_end, NULL); res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); /* check the results */ i = j = idx = 0; while (i < n) { while (j < n) { idx = i * n + j; if (c[idx] != a[idx] + b[idx]) { printf("c[%d] = %f\n", idx, c[idx]); printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]); return -1; } j++; } i++; } free(a); free(b); free(c); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); printf("Exec: %f\n", exec); printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec); printf("Total: %f\n", total); return 0; }
/*
 * Looks up the kernel called `kernel`, marshals its arguments and launches it
 * with cuLaunchGridAsync on the NULL stream.
 *
 *   kernel  name of the kernel to run
 *   grid    launch grid; the call returns immediately, without launching, if
 *           any of grid.x/y/z or block.x/y/z is 0, and grid.z other than 1 is
 *           reported through error()
 *   block   thread-block shape passed to cuFuncSetBlockShape
 *   shmem   shared-memory size passed to cuFuncSetSharedSize
 *   flags   not referenced anywhere in this body
 *   ptrs    one pointer per argument, parallel to types
 *   types   argument type tags, terminated by SWAN_END
 *
 * Steps, in order:
 *   1. try_init() is called.
 *   2. The kernel is searched by name in state.func_names. On a miss every
 *      module in state.mods is queried with cuModuleGetFunction, and the
 *      first hit is appended to state.funcs / state.func_names for later
 *      calls. If nothing is found, error("No kernel found") is raised.
 *   3. types[] is walked until SWAN_END. Value types are handled by the
 *      DEBLOCK(...) cases, and SWAN_PTR is copied with cuParamSetv as
 *      sizeof(void*) or sizeof(int) bytes depending on CUDA_MAJOR /
 *      CUDA_MINOR. An unknown tag is printed and reported through error().
 *   4. Parameter size, block shape and shared size are set through
 *      CU_SAFE_CALL_NO_SYNC. On CUDA 3.1 or later the cache configuration is
 *      set to CU_FUNC_CACHE_PREFER_SHARED.
 *   5. The launch is issued. A failure prints a diagnostic to stderr, then
 *      assert(0) and exit(-99).
 *
 * NOTE(review): DEBLOCK is defined outside this block; presumably it expands
 * to a `case` that aligns `offset`, copies the value behind `ptr` and
 * advances `offset` - confirm against the macro definition.
 * NOTE(review): the results of both realloc() calls and of malloc() are used
 * without a NULL check, and error() is assumed not to return - confirm.
 * NOTE(review): in this collapsed layout it is not possible to tell which of
 * the DEBLOCK cases that follow a `//` marker are really commented out -
 * check against the original formatting before editing the switch.
 */
void swanRunKernelAsync( const char *kernel, block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types ) { // find the kernel if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; } // suppress launch of kernel if any of the launch dims are 0 CUfunction f = NULL; int i; int offset = 0; CUresult err; int type; int idx=0; try_init(); for( i=0; i < state.num_funcs; i++ ) { if( !strcmp( state.func_names[i], kernel ) ) { f = state.funcs[i]; break; } } if( f == NULL ) { for( i=0; i < state.num_mods; i++ ) { cuModuleGetFunction( &f, state.mods[i], kernel ); if( f!= NULL ) { // found a kernel. store it for future use int j = state.num_funcs; state.num_funcs++; state.funcs = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * state.num_funcs ); state.funcs[j] = f; state.func_names = (char**) realloc( state.func_names, sizeof(char*) * state.num_funcs ); state.func_names[j] = (char*) malloc( strlen(kernel) + 1 ); strcpy( state.func_names[j], kernel ); break; } } } if( f== NULL ) { fprintf(stderr, "Error running kernel [%s] : \n", kernel ); error( "No kernel found" ); } if( grid.z != 1 ) { printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z ); error( "grid.z needs to be 1" ); } //printf("Running kernel [%s]\n", kernel ); type = types[idx]; while( type != SWAN_END ) { void *ptr = ptrs[idx]; switch( type ) { // DEBLOCK( SWAN_uchar, uchar, 1 ); DEBLOCK( SWAN_uchar2, uchar2, 2 ); DEBLOCK( SWAN_uchar3, uchar3, 1 ); DEBLOCK( SWAN_uchar4, uchar4, 4 ); DEBLOCK( SWAN_char , int, 1 ); // DEBLOCK( SWAN_char1 , char1, 1 ); DEBLOCK( SWAN_char2 , char2, 2 ); DEBLOCK( SWAN_char3 , char3, 1 ); DEBLOCK( SWAN_char4 , char4, 4 ); DEBLOCK( SWAN_int, int, 4 ); // DEBLOCK( SWAN_int1, int1, 4 ); DEBLOCK( SWAN_int2, int2, 8 ); DEBLOCK( SWAN_int3, int3, 4 ); DEBLOCK( SWAN_int4, int4, 16 ); // DEBLOCK( SWAN_float, double, 4 ); // DEBLOCK( SWAN_float1, float1, 4 ); 
DEBLOCK( SWAN_float2, float2, 8 ); DEBLOCK( SWAN_float3, float3, 4 ); DEBLOCK( SWAN_float4, float4, 16 ); DEBLOCK( SWAN_uint, uint, 4 ); DEBLOCK( SWAN_uint2, uint2, 8 ); DEBLOCK( SWAN_uint3, uint3, 4 ); DEBLOCK( SWAN_uint4, uint4, 16 ); DEBLOCK( SWAN_float, float, 4 ); //#define DEBLOCK(swan_type,type,OFFSET) #if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4 case SWAN_PTR: { //printf("PTR as NATIVE\n"); ALIGN_UP( offset, (sizeof(void*))); cuParamSetv( f, offset, ptr, sizeof(void*) ); offset += sizeof(void*); } break; #else case SWAN_PTR: { //printf("PTR as INT\n"); ALIGN_UP( offset, (sizeof(int))); cuParamSetv( f, offset, ptr, sizeof(int) ); offset += sizeof(int); } break; #endif default: printf("%d\n", type ); error("Parameter type not handled\n"); } idx++; type = types[idx]; } //printf("Launching kernel [%s] [%X] with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z ); //printf(" TOTAL OFFSET %d\n", offset ); CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) ); CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) ); CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) ); #if (CUDA_MAJOR ==3 && CUDA_MINOR >=1 ) || CUDA_MAJOR>=4 cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd #endif err = cuLaunchGridAsync( f, grid.x, grid.y, NULL ) ; //state.stream ) ; if( err != CUDA_SUCCESS ) { fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z ); assert(0); exit(-99); } //printf("Kernel completed\n" ); }
void Function::setSharedSize(unsigned int bytes) const { detail::error_check(cuFuncSetSharedSize(impl->func, bytes), "Can't set Cuda function shared memory size"); }