void CudaModule::launchKernel(CUfunction kernel, const Vec2i& blockSize, const Vec2i& gridSize, bool async, CUstream stream)
{
    // Refuse to launch a null kernel handle up front.
    if (!kernel)
        fail("CudaModule: No kernel specified!");

#if (CUDA_VERSION >= 3000)
    // Apply the module-wide L1/shared cache preference when the driver
    // exposes the entry point (it is resolved dynamically and may be NULL).
    if (cuFuncSetCacheConfig != NULL)
    {
        checkError("cuFuncSetCacheConfig",
                   cuFuncSetCacheConfig(kernel, s_preferL1 ? CU_FUNC_CACHE_PREFER_L1
                                                           : CU_FUNC_CACHE_PREFER_SHARED));
    }
#endif

    // Push pending global and texture-reference state before launching.
    updateGlobals();
    updateTexRefs(kernel);

    // 2D block shape; z extent is fixed at 1.
    checkError("cuFuncSetBlockShape",
               cuFuncSetBlockShape(kernel, blockSize.x, blockSize.y, 1));

    // Use the asynchronous launch path only when both requested and available.
    bool useAsync = async && (cuLaunchGridAsync != NULL);
    if (useAsync)
        checkError("cuLaunchGridAsync",
                   cuLaunchGridAsync(kernel, gridSize.x, gridSize.y, stream));
    else
        checkError("cuLaunchGrid",
                   cuLaunchGrid(kernel, gridSize.x, gridSize.y));
}
void Function::launch(int gridWidth, int gridHeight, const Stream &stream) const { //std::cout << "DOIN IT" << std::endl; detail::error_check( cuLaunchGridAsync(impl->func, gridWidth, gridHeight, stream.impl->stream), "Can't launch asynchronous Cuda function grid"); }
//host driver void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int imgSize, int numRegionsY, int shmemX, int shmem, int nrhs, hostdrv_pars_t *prhs) { //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); CUresult err = CUDA_SUCCESS; // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, shmem)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imgSize)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imgSize); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, numRegionsY)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(numRegionsY); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, grid.x, grid.y, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } }
/* Driver */
// Launches the pairwise-distance kernel `drvfun` over an n-element target
// using the legacy (pre-CUDA 4.0) cuParamSet* driver API on the default
// stream (0). Pushes the nrhs caller-supplied buffers (prhs) followed by n
// and m as ints; requests m*sizeof(float) bytes of dynamic shared memory.
// Any driver error aborts the MEX call via mexErrMsgTxt.
// NOTE(review): offsets are NOT aligned with ALIGN_UP here (unlike
// hostGPUDRV) — correct only if every prhs[p].psize keeps the next
// parameter naturally aligned; confirm against the kernel signature.
void hostGPUPdist(CUfunction drvfun, int nrhs, hostdrv_pars_t *prhs, int n, int m) {
  /* Each thread block computes a linear block of the target */
  int gridx = (n + BLOCK_DIM1D - 1) / BLOCK_DIM1D; //BLOCK_DIM1D set in GPUkernel.hh
  CUresult err = CUDA_SUCCESS;
  // setup execution parameters
  if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }
  // One float of shared memory per column (m floats total).
  if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, m*sizeof(float))) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }
  // add parameters
  int poffset = 0;  // running byte offset into the kernel parameter block
  // CUDA kernels interface
  // N: number of elements
  // offset: used for streams
  // Caller-supplied buffers first, in order.
  for (int p=0;p<nrhs;p++) {
    if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    poffset += prhs[p].psize;
  }
  // Then the two scalar arguments.
  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, n)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(n);
  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, m)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(m);
  // Total parameter-block size must be declared before launch.
  if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }
  // Asynchronous 1D launch on the default stream (0).
  err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
  if (CUDA_SUCCESS != err) {
    mexErrMsgTxt("Error running kernel");
  }
}
/************************************************* * HOST DRIVERS *************************************************/ void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) { unsigned int maxthreads = MAXTHREADS_STREAM; int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * BLOCK_DIM1D; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * BLOCK_DIM1D; else size = maxthreads * BLOCK_DIM1D; int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements // offset: used for streams ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
//host driver //void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) { void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){ //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); unsigned int maxthreads = 65000; // Set threads per block here. unsigned int blocksdim1d = 256; dim3 threads(blocksdim1d, 1, 1); int nstreams = iDivUp(N, maxthreads*blocksdim1d); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * blocksdim1d; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * blocksdim1d; else size = maxthreads * blocksdim1d; int gridx = iDivUp(size, blocksdim1d); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.y))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); ALIGN_UP(poffset, __alignof(imx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imx); ALIGN_UP(poffset, __alignof(imy)); if (CUDA_SUCCESS != 
cuParamSeti(drvfun, poffset, imy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imy); ALIGN_UP(poffset, __alignof(outx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outx); ALIGN_UP(poffset, __alignof(outy)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outy); ALIGN_UP(poffset, __alignof(poolx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(poolx); ALIGN_UP(poffset, __alignof(pooly)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(pooly); // if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { // mexErrMsgTxt("Error in cuParamSeti"); // } // poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
/* Launches the NV12 -> ARGB color-conversion kernel `fpFunc` on `streamID`.
 * Each thread outputs 2 pixels, so the grid is half as wide as the image.
 * Uses cuLaunchKernel on CUDA >= 4.0, otherwise the legacy cuParamSet* /
 * cuLaunchGridAsync path. Returns the CUresult of the launch. */
CUresult cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                 CUdeviceptr d_dstARGB, size_t nDestPitch,
                                 uint32 width, uint32 height,
                                 CUfunction fpFunc, CUstream streamID)
{
    CUresult status;
    // Each thread will output 2 pixels at a time. The grid size width is half
    // as large because of this
    dim3 block(32, 16, 1);
    dim3 grid((width + (2 * block.x - 1)) / (2 * block.x),
              (height + (block.y - 1)) / block.y, 1);

#if CUDA_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch, &d_dstARGB, &nDestPitch, &width, &height };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID, args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    cutilDrvSafeCall(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers
    // device pointer for Source Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_srcNV12, sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_dstARGB, sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nDestPitch, sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    cutilDrvSafeCall(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status) {
        // BUGFIX: print the function handle with %p instead of casting it to
        // unsigned int ("%08x"), which truncated the upper 32 bits of the
        // handle on 64-bit builds.
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n",
                (void *)fpFunc, status);
        return status;
    }

    return status;
}
void swanRunKernelAsync( const char *kernel, block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types ) { // find the kernel if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; } // suppress launch of kernel if any of the launch dims are 0 CUfunction f = NULL; int i; int offset = 0; CUresult err; int type; int idx=0; try_init(); for( i=0; i < state.num_funcs; i++ ) { if( !strcmp( state.func_names[i], kernel ) ) { f = state.funcs[i]; break; } } if( f == NULL ) { for( i=0; i < state.num_mods; i++ ) { cuModuleGetFunction( &f, state.mods[i], kernel ); if( f!= NULL ) { // found a kernel. store it for future use int j = state.num_funcs; state.num_funcs++; state.funcs = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * state.num_funcs ); state.funcs[j] = f; state.func_names = (char**) realloc( state.func_names, sizeof(char*) * state.num_funcs ); state.func_names[j] = (char*) malloc( strlen(kernel) + 1 ); strcpy( state.func_names[j], kernel ); break; } } } if( f== NULL ) { fprintf(stderr, "Error running kernel [%s] : \n", kernel ); error( "No kernel found" ); } if( grid.z != 1 ) { printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z ); error( "grid.z needs to be 1" ); } //printf("Running kernel [%s]\n", kernel ); type = types[idx]; while( type != SWAN_END ) { void *ptr = ptrs[idx]; switch( type ) { // DEBLOCK( SWAN_uchar, uchar, 1 ); DEBLOCK( SWAN_uchar2, uchar2, 2 ); DEBLOCK( SWAN_uchar3, uchar3, 1 ); DEBLOCK( SWAN_uchar4, uchar4, 4 ); DEBLOCK( SWAN_char , int, 1 ); // DEBLOCK( SWAN_char1 , char1, 1 ); DEBLOCK( SWAN_char2 , char2, 2 ); DEBLOCK( SWAN_char3 , char3, 1 ); DEBLOCK( SWAN_char4 , char4, 4 ); DEBLOCK( SWAN_int, int, 4 ); // DEBLOCK( SWAN_int1, int1, 4 ); DEBLOCK( SWAN_int2, int2, 8 ); DEBLOCK( SWAN_int3, int3, 4 ); DEBLOCK( SWAN_int4, int4, 16 ); // DEBLOCK( SWAN_float, double, 4 ); // DEBLOCK( SWAN_float1, float1, 4 ); 
DEBLOCK( SWAN_float2, float2, 8 ); DEBLOCK( SWAN_float3, float3, 4 ); DEBLOCK( SWAN_float4, float4, 16 ); DEBLOCK( SWAN_uint, uint, 4 ); DEBLOCK( SWAN_uint2, uint2, 8 ); DEBLOCK( SWAN_uint3, uint3, 4 ); DEBLOCK( SWAN_uint4, uint4, 16 ); DEBLOCK( SWAN_float, float, 4 ); //#define DEBLOCK(swan_type,type,OFFSET) #if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4 case SWAN_PTR: { //printf("PTR as NATIVE\n"); ALIGN_UP( offset, (sizeof(void*))); cuParamSetv( f, offset, ptr, sizeof(void*) ); offset += sizeof(void*); } break; #else case SWAN_PTR: { //printf("PTR as INT\n"); ALIGN_UP( offset, (sizeof(int))); cuParamSetv( f, offset, ptr, sizeof(int) ); offset += sizeof(int); } break; #endif default: printf("%d\n", type ); error("Parameter type not handled\n"); } idx++; type = types[idx]; } //printf("Launching kernel [%s] [%X] with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z ); //printf(" TOTAL OFFSET %d\n", offset ); CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) ); CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) ); CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) ); #if (CUDA_MAJOR ==3 && CUDA_MINOR >=1 ) || CUDA_MAJOR>=4 cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd #endif err = cuLaunchGridAsync( f, grid.x, grid.y, NULL ) ; //state.stream ) ; if( err != CUDA_SUCCESS ) { fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z ); assert(0); exit(-99); } //printf("Kernel completed\n" ); }