Example no. 1
void CudaModule::launchKernel(CUfunction kernel, const Vec2i& blockSize, 
                              const Vec2i& gridSize, bool async, 
                              CUstream stream)
{
  if (!kernel) {
    fail("CudaModule: No kernel specified!");
  }

#if (CUDA_VERSION >= 3000)
  if (NULL != cuFuncSetCacheConfig)
  {
    CUfunc_cache cache = (s_preferL1)? CU_FUNC_CACHE_PREFER_L1 : 
                                       CU_FUNC_CACHE_PREFER_SHARED;  
    checkError("cuFuncSetCacheConfig", cuFuncSetCacheConfig( kernel, cache) );
  }
#endif

  updateGlobals();
  updateTexRefs(kernel);
  checkError("cuFuncSetBlockShape", cuFuncSetBlockShape(kernel, blockSize.x, blockSize.y, 1));

  if (async && (NULL != cuLaunchGridAsync)) 
  {
    checkError("cuLaunchGridAsync", 
                cuLaunchGridAsync(kernel, gridSize.x, gridSize.y, stream));
  } 
  else 
  {
    checkError("cuLaunchGrid", 
                cuLaunchGrid(kernel, gridSize.x, gridSize.y));
  }
}
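
The launcher above funnels every driver API call through a small checkError helper that aborts when a CUresult is not CUDA_SUCCESS. The helper itself is not shown; a minimal sketch, assuming it simply reports the failing call and exits (the reporting format is an assumption, not part of the original CudaModule), could look like this:

#include <cuda.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical helper: abort with the name of the driver API call that failed.
static void checkError(const char* funcName, CUresult res)
{
    if (res != CUDA_SUCCESS)
    {
        std::fprintf(stderr, "CudaModule: %s failed (CUresult = %d)\n", funcName, (int)res);
        std::exit(EXIT_FAILURE);
    }
}
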
Example no. 2
void Function::launch(int gridWidth, int gridHeight, const Stream &stream) const
{
	//std::cout << "DOIN IT" << std::endl;
	detail::error_check(
		cuLaunchGridAsync(impl->func, gridWidth, gridHeight, stream.impl->stream),
		"Can't launch asynchronous Cuda function grid");
}
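
Example no. 2 wraps the launch in a detail::error_check helper that is not shown. A plausible sketch, assuming it turns a failing CUresult into an exception (the exact behaviour and error text are assumptions), is:

#include <cuda.h>
#include <stdexcept>
#include <string>

namespace detail
{
    // Hypothetical error_check: throw when a driver API call does not return CUDA_SUCCESS.
    inline void error_check(CUresult result, const char* message)
    {
        if (result != CUDA_SUCCESS)
            throw std::runtime_error(std::string(message) +
                                     " (CUresult = " + std::to_string(static_cast<int>(result)) + ")");
    }
}
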
Example no. 3
//host driver
void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int imgSize, int numRegionsY, int shmemX, int shmem, int nrhs, hostdrv_pars_t *prhs) {

  //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z);

  CUresult err = CUDA_SUCCESS;

  // setup execution parameters
  if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }

  if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, shmem)) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }


  //mexPrintf("block shape ok\n");

  // add parameters
  int poffset = 0;

  // CUDA kernels interface
  // N: number of elements
  for (int p=0;p<nrhs;p++) {
    if (CUDA_SUCCESS
	!= cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    poffset += prhs[p].psize;
  }

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imgSize)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(imgSize);

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, numRegionsY)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(numRegionsY);

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(shmemX);

  if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  err = cuLaunchGridAsync(drvfun, grid.x, grid.y, 0);
  if (CUDA_SUCCESS != err) {
    mexErrMsgTxt("Error running kernel");
  }
  
}
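
This driver and the ones that follow pass their kernel arguments through an array of hostdrv_pars_t descriptors. The struct itself is not shown; based on the fields used above (par, psize and, in Examples 5 and 6, align), a plausible definition and a caller that forwards a device pointer might look like the sketch below. Field types and the setDevicePtrArg helper are assumptions, not the actual library definition.

#include <cuda.h>

// Hypothetical parameter descriptor matching the fields used in the drivers above.
typedef struct {
    void        *par;    // pointer to the host copy of the argument (e.g. a CUdeviceptr)
    unsigned int psize;  // size of the argument in bytes, passed to cuParamSetv
    unsigned int align;  // required alignment, used with ALIGN_UP in Examples 5 and 6
} hostdrv_pars_t;

// Example of filling one descriptor with a device pointer before calling hostDriver().
static void setDevicePtrArg(hostdrv_pars_t *p, CUdeviceptr *dptr)
{
    p->par   = dptr;
    p->psize = sizeof(*dptr);
    p->align = __alignof(CUdeviceptr);
}
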
Example no. 4
/* Driver */
void hostGPUPdist(CUfunction drvfun, int nrhs, hostdrv_pars_t *prhs, int n, int m) {

  /* Each thread block computes a linear block of the target */
  int gridx = (n + BLOCK_DIM1D - 1) / BLOCK_DIM1D; //BLOCK_DIM1D set in GPUkernel.hh
  
  CUresult err = CUDA_SUCCESS;

  // setup execution parameters
  if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }

  if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, m*sizeof(float))) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }

  // add parameters
  int poffset = 0;

  // CUDA kernels interface
  // N: number of elements
  // offset: used for streams

  for (int p=0;p<nrhs;p++) {
    if (CUDA_SUCCESS
	!= cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    poffset += prhs[p].psize;
  }

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, n)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(n);

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, m)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(m);


  if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
  if (CUDA_SUCCESS != err) {
    mexErrMsgTxt("Error running kernel");
  }
}
Example no. 5
/*************************************************
 * HOST DRIVERS
 *************************************************/
void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) {


  unsigned int maxthreads = MAXTHREADS_STREAM;
  int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D);
  CUresult err = CUDA_SUCCESS;
  for (int str = 0; str < nstreams; str++) {
    int offset = str * maxthreads * BLOCK_DIM1D;
    int size = 0;
    if (str == (nstreams - 1))
      size = N - str * maxthreads * BLOCK_DIM1D;
    else
      size = maxthreads * BLOCK_DIM1D;


    int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks

    // setup execution parameters

    if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) {
      mexErrMsgTxt("Error in cuFuncSetBlockShape");
    }

    if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) {
      mexErrMsgTxt("Error in cuFuncSetSharedSize");
    }


    // add parameters
    int poffset = 0;

    // CUDA kernels interface
    // N: number of elements
    // offset: used for streams
    ALIGN_UP(poffset, __alignof(size));
    if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    poffset += sizeof(size);

    ALIGN_UP(poffset, __alignof(offset));
    if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    poffset += sizeof(offset);

    for (int p=0;p<nrhs;p++) {
      ALIGN_UP(poffset, prhs[p].align);
      if (CUDA_SUCCESS
          != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
        mexErrMsgTxt("Error in cuParamSetv");
      }
      poffset += prhs[p].psize;
    }

    if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
      mexErrMsgTxt("Error in cuParamSetSize");
    }

    err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
    if (CUDA_SUCCESS != err) {
      mexErrMsgTxt("Error running kernel");
    }
  }

}
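
This driver relies on the ALIGN_UP and iDivUp helpers without showing them. ALIGN_UP is the offset-alignment macro recommended for cuParamSet* in the pre-4.0 driver API documentation, and iDivUp is the usual ceiling-division helper from the CUDA SDK samples; their standard definitions, shown here for completeness, are:

// Round 'offset' up to the next multiple of 'alignment' (alignment must be a power of two).
#define ALIGN_UP(offset, alignment) \
    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

// Integer division rounded up: number of blocks needed to cover 'a' elements with 'b' per block.
static int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}
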
Example no. 6
//host driver
//void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) {
void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){
    //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z);
    
    
    unsigned int maxthreads = 65000;
    // Set threads per block here.
    unsigned int blocksdim1d = 256;
    dim3 threads(blocksdim1d, 1, 1);
    int nstreams = iDivUp(N, maxthreads*blocksdim1d);
    CUresult err = CUDA_SUCCESS;
    for (int str = 0; str < nstreams; str++) {
        int offset = str * maxthreads * blocksdim1d;
        int size = 0;
        if (str == (nstreams - 1))
            size = N - str * maxthreads * blocksdim1d;
        else
            size = maxthreads * blocksdim1d;
        
        
        int gridx = iDivUp(size, blocksdim1d); // number of x blocks
        
        // setup execution parameters
        
        if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) {
            mexErrMsgTxt("Error in cuFuncSetBlockShape");
        }
        
        if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) {
            mexErrMsgTxt("Error in cuFuncSetSharedSize");
        }
        
        //mexPrintf("block shape ok\n");
        
        // add parameters
        int poffset = 0;
        
        // CUDA kernels interface
        // N: number of elements
        for (int p=0;p<nrhs;p++) {
            ALIGN_UP(poffset, prhs[p].align);
            if (CUDA_SUCCESS
                    != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
                mexErrMsgTxt("Error in cuParamSetv");
            }
            poffset += prhs[p].psize;
        }
        
        ALIGN_UP(poffset, __alignof(size));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(size);
        
        ALIGN_UP(poffset, __alignof(offset));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(offset);
        
        ALIGN_UP(poffset, __alignof(imx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(imx);
        
        ALIGN_UP(poffset, __alignof(imy));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imy)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(imy);
        
        ALIGN_UP(poffset, __alignof(outx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(outx);
        
        ALIGN_UP(poffset, __alignof(outy));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(outy);
        
        ALIGN_UP(poffset, __alignof(poolx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(poolx);
        
        ALIGN_UP(poffset, __alignof(pooly));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(pooly);
        
//   if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) {
//     mexErrMsgTxt("Error in cuParamSeti");
//   }
//   poffset += sizeof(shmemX);
        
        if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
            mexErrMsgTxt("Error in cuParamSetSize");
        }
        
        err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
        if (CUDA_SUCCESS != err) {
            mexErrMsgTxt("Error running kernel");
        }
        
    }
}
Example no. 7
CUresult  cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                  CUdeviceptr d_dstARGB, size_t nDestPitch,
                                  uint32 width,          uint32 height,
                                  CUfunction fpFunc, CUstream streamID)
{
    CUresult status;
    // Each thread will output 2 pixels at a time.  The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if CUDA_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height
                   };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    cutilDrvSafeCall(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers

    // device pointer for Source Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_srcNV12,    sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_dstARGB,    sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    //  set the Destination Pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nDestPitch,   sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    cutilDrvSafeCall(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", (void *)fpFunc, status);
        return status;
    }

    return status;
}
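
Example no. 7 shows both launch paths side by side: from CUDA 4.0 on, the whole cuFuncSetBlockShape/cuParamSet*/cuLaunchGridAsync sequence collapses into a single cuLaunchKernel call that receives an array of pointers to the arguments. As a hedged sketch, one chunk of the streamed launch in Example no. 5 could be ported the same way; the launchChunk name and the single CUdeviceptr payload are assumptions, not part of the original driver.

#include <cuda.h>

// Hedged sketch: launching one chunk of Example no. 5 with the CUDA >= 4.0 API.
// 'drvfun', 'gridx', 'blockDim1d', 'size' and 'chunkOffset' play the same roles as in hostGPUDRV.
static CUresult launchChunk(CUfunction drvfun, int gridx, int blockDim1d,
                            int size, int chunkOffset, CUdeviceptr data)
{
    // Each argument is passed by address; the driver handles the offset and
    // alignment bookkeeping that cuParamSet* required.
    void *args[] = { &size, &chunkOffset, &data };

    return cuLaunchKernel(drvfun,
                          gridx, 1, 1,        // grid dimensions
                          blockDim1d, 1, 1,   // block dimensions
                          0,                  // dynamic shared memory in bytes
                          NULL,               // stream (NULL = default stream)
                          args, NULL);        // kernel parameters, no extra options
}
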
Example no. 8
void swanRunKernelAsync( const char *kernel,  block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types  ) {
	// find the kernel

	if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; } // suppress launch of kernel if any of the launch dims are 0

	CUfunction f = NULL;
	int i;
	int offset = 0;
	CUresult err;

	int type;
	int idx=0;
	try_init();
	for( i=0; i < state.num_funcs; i++ ) {
		if( !strcmp( state.func_names[i], kernel ) ) {
			f = state.funcs[i];
			break;
		}
	}

	if( f == NULL ) {
		for( i=0; i < state.num_mods; i++ ) {
			cuModuleGetFunction( &f, state.mods[i], kernel );
			if( f!= NULL ) { 
				// found a kernel. store it for future use
				int j = state.num_funcs;
				state.num_funcs++;
				state.funcs      = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * state.num_funcs );
				state.funcs[j]   = f;
				state.func_names = (char**)      realloc( state.func_names, sizeof(char*) * state.num_funcs );
				state.func_names[j] = (char*) malloc( strlen(kernel) + 1 );
				strcpy( state.func_names[j], kernel );
				break; 
			}
		}
	}

	if( f== NULL ) {
		fprintf(stderr, "Error running kernel [%s] : \n", kernel );
		error( "No kernel found" );
	}

	if( grid.z != 1 ) {
		printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z );
		error( "grid.z needs to be 1" );
	}

//printf("Running kernel [%s]\n", kernel );

	type = types[idx];
	while( type != SWAN_END ) {
		void *ptr = ptrs[idx];
		switch( type ) {
//			DEBLOCK( SWAN_uchar, uchar,  1 );
			DEBLOCK( SWAN_uchar2, uchar2,  2 );
			DEBLOCK( SWAN_uchar3, uchar3,  1 );
			DEBLOCK( SWAN_uchar4, uchar4,  4 );
			DEBLOCK( SWAN_char , int,  1 );
//			DEBLOCK( SWAN_char1 , char1,  1 );
			DEBLOCK( SWAN_char2 , char2,  2 );
			DEBLOCK( SWAN_char3 , char3,  1 );
			DEBLOCK( SWAN_char4 , char4,  4 );
			DEBLOCK( SWAN_int, int,  4 );
//			DEBLOCK( SWAN_int1, int1,  4 );
			DEBLOCK( SWAN_int2, int2,  8 );
			DEBLOCK( SWAN_int3, int3,  4 );
			DEBLOCK( SWAN_int4, int4,  16 );
//			DEBLOCK( SWAN_float, double,  4 );
//			DEBLOCK( SWAN_float1, float1,  4 );
			DEBLOCK( SWAN_float2, float2,  8 );
			DEBLOCK( SWAN_float3, float3,  4 );
			DEBLOCK( SWAN_float4, float4,  16 );

			DEBLOCK( SWAN_uint, uint,  4 );
			DEBLOCK( SWAN_uint2, uint2,  8 );
			DEBLOCK( SWAN_uint3, uint3,  4 );
			DEBLOCK( SWAN_uint4, uint4,  16 );
			DEBLOCK( SWAN_float, float,  4 );


//#define DEBLOCK(swan_type,type,OFFSET) 
#if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4
			case SWAN_PTR: 
				{
//printf("PTR as NATIVE\n");
				ALIGN_UP( offset, (sizeof(void*)));
				cuParamSetv( f, offset, ptr, sizeof(void*) );
				offset += sizeof(void*); }
			break;
#else
			case SWAN_PTR: 
				{
//printf("PTR as INT\n");
				ALIGN_UP( offset, (sizeof(int)));
				cuParamSetv( f, offset, ptr, sizeof(int) );
				offset += sizeof(int); }
			break;
#endif



			default:
        printf("%d\n", type );
				error("Parameter type not handled\n");


		}
		idx++;
		type = types[idx];
	}

//printf("Launching kernel [%s] [%X]  with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z );
//printf(" TOTAL OFFSET %d\n", offset );
	CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) );
#if (CUDA_MAJOR == 3 && CUDA_MINOR >= 1) || CUDA_MAJOR >= 4
	cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd
#endif

	err = cuLaunchGridAsync( f, grid.x, grid.y, NULL /* state.stream */ );

	if( err != CUDA_SUCCESS ) {
		fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z );
	assert(0);
		exit(-99);
	}

//printf("Kernel completed\n" );
}
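
The DEBLOCK macro used in Example no. 8 is not shown. Judging from the SWAN_PTR cases and the (type, alignment) pairs in each invocation, it most likely expands to an aligned cuParamSetv call inside the switch; the reconstruction below is an assumption, not the actual swan source.

// Hypothetical expansion of DEBLOCK(swan_type, type, ALIGNMENT): align the running
// offset, copy the argument into the kernel parameter space, and advance past it.
#define DEBLOCK(swan_type, type, ALIGNMENT)                 \
    case swan_type:                                         \
        ALIGN_UP(offset, (ALIGNMENT));                      \
        cuParamSetv(f, offset, ptr, sizeof(type));          \
        offset += sizeof(type);                             \
        break;
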