/**
 * Configure and launch a kernel using the legacy driver-API launch path.
 *
 * Steps, in order: reject a null kernel handle, apply the cache preference
 * selected by s_preferL1 (only when built against CUDA >= 3.0 and the entry
 * point is available), refresh module globals and texture references, set a
 * blockSize.x by blockSize.y by 1 block shape, then launch a
 * gridSize.x by gridSize.y grid.
 *
 * @param kernel     Kernel handle; a null handle is reported through fail().
 * @param blockSize  Threads per block in x and y.
 * @param gridSize   Blocks per grid in x and y.
 * @param async      Request an asynchronous launch on the given stream.
 * @param stream     Stream used only for the asynchronous launch.
 *
 * If an asynchronous launch is requested but cuLaunchGridAsync is not
 * available, the launch falls back to the synchronous cuLaunchGrid.
 */
void CudaModule::launchKernel(CUfunction kernel, const Vec2i& blockSize, 
                              const Vec2i& gridSize, bool async, 
                              CUstream stream)
{
  if (!kernel) {
    fail("CudaModule: No kernel specified!");
  }

#if (CUDA_VERSION >= 3000)
  if (cuFuncSetCacheConfig != NULL)
  {
    // Default to favouring shared memory; switch to L1 when requested.
    CUfunc_cache cachePreference = CU_FUNC_CACHE_PREFER_SHARED;
    if (s_preferL1)
    {
      cachePreference = CU_FUNC_CACHE_PREFER_L1;
    }
    checkError("cuFuncSetCacheConfig", cuFuncSetCacheConfig(kernel, cachePreference));
  }
#endif

  updateGlobals();
  updateTexRefs(kernel);
  checkError("cuFuncSetBlockShape", cuFuncSetBlockShape(kernel, blockSize.x, blockSize.y, 1));

  // The async entry point may be missing, in which case launch synchronously.
  const bool launchAsync = async && (cuLaunchGridAsync != NULL);
  if (!launchAsync)
  {
    checkError("cuLaunchGrid",
               cuLaunchGrid(kernel, gridSize.x, gridSize.y));
    return;
  }

  checkError("cuLaunchGridAsync",
             cuLaunchGridAsync(kernel, gridSize.x, gridSize.y, stream));
}
const unsigned long CUDARunner::RunStep()
{
	//unsigned int best=0;
	//unsigned int bestg=~0;
	int offset=0;

	if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0)
	{
		AllocateResources(m_numb,m_numt);
	}
	m_out[0].m_bestnonce=0;
	cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out));

	cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));

	int loops=GetStepIterations();
	int bits=GetStepBitShift()-1;

	void *ptr=(void *)(size_t)m_devin;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ptr=(void *)(size_t)m_devout;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ALIGN_UP(offset, __alignof(loops));
	cuParamSeti(m_function,offset,loops);
	offset+=sizeof(loops);

	ALIGN_UP(offset, __alignof(bits));
	cuParamSeti(m_function,offset,bits);
	offset+=sizeof(bits);

	cuParamSetSize(m_function,offset);

	cuFuncSetBlockShape(m_function,m_numt,1,1);
	cuLaunchGrid(m_function,m_numb,1);

	cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out));

	// very unlikely that we will find more than 1 hash with H=0
	// so we'll just return the first one and not even worry about G
	for(int i=0; i<1/*m_numb*m_numt*/; i++)
	{
		if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg)
		{
			return CryptoPP::ByteReverse(m_out[i].m_bestnonce);
			//best=m_out[i].m_bestnonce;
			//bestg=m_out[i].m_bestg;
		}
	}

	return 0;

}
Exemple #3
0
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    runBlocks
 * Signature: (III)I
 *
 * Uploads the GC info block, the to-space heap, the handles and the heap
 * bookkeeping values, launches cuFunction with a block_shape x 1 x 1 block
 * on a grid_shape x 1 grid, waits for completion and downloads the updated
 * heap and the exceptions buffer.
 *
 * Returns 0 (CUDA_SUCCESS) on success, otherwise the failing CUresult cast
 * to jint. The temporary host buffer is released on every path.
 */
JNIEXPORT jint JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_runBlocks
  (JNIEnv *env, jobject this_obj, jint num_blocks, jint block_shape, jint grid_shape){

  CUresult status;
  jlong * infoSpace = (jlong *) malloc(gc_space_size);
  if (NULL == infoSpace)
  {
    printf("error allocating gc info space\n");
    return (jint) CUDA_ERROR_OUT_OF_MEMORY;
  }

  infoSpace[1] = heapEndPtr;

  /* Upload phase: stop at the first failing copy. */
  status = cuMemcpyHtoD(gcInfoSpace, infoSpace, gc_space_size);
  if (CUDA_SUCCESS == status)
    status = cuMemcpyHtoD(gpuToSpace, toSpace, heapEndPtr);
  //cuMemcpyHtoD(gpuTexture, textureMemory, textureMemSize);
  if (CUDA_SUCCESS == status)
    status = cuMemcpyHtoD(gpuHandlesMemory, handlesMemory, num_blocks * sizeof(jlong));
  if (CUDA_SUCCESS == status)
    status = cuMemcpyHtoD(gpuHeapEndPtr, &heapEndPtr, sizeof(jlong));
  if (CUDA_SUCCESS == status)
    status = cuMemcpyHtoD(gpuBufferSize, &bufferSize, sizeof(jlong));
  if (CUDA_SUCCESS != status)
  {
    printf("error in cuMemcpyHtoD %d\n", status);
    goto done;
  }
  
/*
  status = cuModuleGetTexRef(&cache, cuModule, "m_Cache");  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuModuleGetTexRef %d\n", status);
  }

  status = cuTexRefSetAddress(0, cache, gpuTexture, textureMemSize);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuTextRefSetAddress %d\n", status);
  }
*/

  status = cuFuncSetBlockShape(cuFunction, block_shape, 1, 1);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuFuncSetBlockShape %d\n", status);
    goto done;
  }

  status = cuLaunchGrid(cuFunction, grid_shape, 1);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuLaunchGrid %d\n", status);
    fflush(stdout);
    goto done;
  }

  status = cuCtxSynchronize();  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuCtxSynchronize %d\n", status);
    goto done;
  }

  /* Download phase: the new heap end must be read before the heap itself,
     because it determines how many bytes of to-space are copied back. */
  status = cuMemcpyDtoH(infoSpace, gcInfoSpace, gc_space_size);
  if (CUDA_SUCCESS == status)
  {
    heapEndPtr = infoSpace[1];
    status = cuMemcpyDtoH(toSpace, gpuToSpace, heapEndPtr);
  }
  if (CUDA_SUCCESS == status)
    status = cuMemcpyDtoH(exceptionsMemory, gpuExceptionsMemory, num_blocks * sizeof(jlong));
  if (CUDA_SUCCESS != status)
  {
    printf("error in cuMemcpyDtoH %d\n", status);
  }

done:
  free(infoSpace);
  return (jint) status;
}
Exemple #4
0
/**
 * Host driver: configures drvfun and launches it on the default stream.
 *
 * The block shape comes from `threads`, the dynamic shared memory size from
 * `shmem`. Kernel arguments are packed back to back: first the `nrhs`
 * entries of `prhs` (each copied verbatim with its own size), then the three
 * integers imgSize, numRegionsY and shmemX. Only grid.x and grid.y are used
 * for the launch. Any driver failure is reported through mexErrMsgTxt.
 */
void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int imgSize, int numRegionsY, int shmemX, int shmem, int nrhs, hostdrv_pars_t *prhs) {

  CUresult status;

  // Execution configuration.
  status = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }

  status = cuFuncSetSharedSize(drvfun, shmem);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }

  // Running byte offset into the kernel parameter buffer.
  int paramBytes = 0;

  // Caller-supplied arguments, in order.
  int argIdx = 0;
  while (argIdx < nrhs) {
    status = cuParamSetv(drvfun, paramBytes, prhs[argIdx].par, prhs[argIdx].psize);
    if (status != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    paramBytes += prhs[argIdx].psize;
    ++argIdx;
  }

  // Trailing integer arguments shared by all kernels using this driver.
  const int trailingInts[3] = { imgSize, numRegionsY, shmemX };
  for (int k = 0; k < 3; ++k) {
    status = cuParamSeti(drvfun, paramBytes, trailingInts[k]);
    if (status != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    paramBytes += sizeof(int);
  }

  status = cuParamSetSize(drvfun, paramBytes);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  status = cuLaunchGridAsync(drvfun, grid.x, grid.y, 0);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error running kernel");
  }

}
Exemple #5
0
//----------------------------------------------------------------------------//
/**
 * Bind the arguments of `kernel` to `cudaKernel` and launch it.
 *
 * Argument order: input buffers, output buffers, int parameters, float
 * parameters, then the width (_w) and height (_h). The launch uses 16x16
 * blocks on a (_w/16 + 1) by (_h/16 + 1) grid.
 *
 * @param kernel      Kernel description (buffers, parameters, name).
 * @param cudaKernel  Driver handle of the compiled kernel.
 * @param err         Forwarded to the _cudaError* helpers for reporting.
 * @return true on success, false if any step reported an error.
 */
bool CUDAImpl::_LaunchKernel(Kernel & kernel,
                             const CUfunction & cudaKernel,
                             std::string * err)
{
    // Set CUDA kernel arguments. The first failure is remembered so that a
    // later successful call cannot mask it.
    CUresult c_err;
    CUresult firstParamErr = CUDA_SUCCESS;
    int paramOffset = 0;
    for(size_t i = 0; i < kernel.inBuffers.size(); ++i) {
        c_err = cuParamSetv(cudaKernel, paramOffset,
                            &_cudaBuffers[kernel.inBuffers[i].buffer->name],
                            sizeof(void*));
        if (firstParamErr == CUDA_SUCCESS) {
            firstParamErr = c_err;
        }
        paramOffset += sizeof(void *);
    }
    for(size_t i = 0; i < kernel.outBuffers.size(); ++i) {
        c_err = cuParamSetv(cudaKernel, paramOffset,
                            &_cudaBuffers[kernel.outBuffers[i].buffer->name],
                            sizeof(void*));
        if (firstParamErr == CUDA_SUCCESS) {
            firstParamErr = c_err;
        }
        paramOffset += sizeof(void *);
    }
    for(size_t i = 0; i < kernel.paramsInt.size(); ++i) {
        c_err = cuParamSetv(cudaKernel, paramOffset,
                            &kernel.paramsInt[i].value, sizeof(int));
        if (firstParamErr == CUDA_SUCCESS) {
            firstParamErr = c_err;
        }
        paramOffset += sizeof(int);
    }
    for(size_t i = 0; i < kernel.paramsFloat.size(); ++i) {
        c_err = cuParamSetv(cudaKernel, paramOffset,
                            &kernel.paramsFloat[i].value, sizeof(float));
        if (firstParamErr == CUDA_SUCCESS) {
            firstParamErr = c_err;
        }
        paramOffset += sizeof(float);
    }
    // width and height parameters
    c_err = cuParamSetv(cudaKernel, paramOffset, &_w, sizeof(int));
    if (firstParamErr == CUDA_SUCCESS) {
        firstParamErr = c_err;
    }
    paramOffset += sizeof(int);
    c_err = cuParamSetv(cudaKernel, paramOffset, &_h, sizeof(int));
    if (firstParamErr == CUDA_SUCCESS) {
        firstParamErr = c_err;
    }
    paramOffset += sizeof(int);

    // Checked once, after all the arguments have been set.
    if(_cudaErrorCheckParamSet(firstParamErr, err, kernel.name)) {
        return false;
    }

    c_err = cuParamSetSize(cudaKernel, paramOffset);
    if (_cudaErrorParamSetSize(c_err, err, kernel.name)) {
        return false;
    }

    // Launch the CUDA kernel
    const int nBlocksHor = _w / 16 + 1;
    const int nBlocksVer = _h / 16 + 1;
    c_err = cuFuncSetBlockShape(cudaKernel, 16, 16, 1);
    if (_cudaErrorLaunchKernel(c_err, err, kernel.name)) {
        return false;
    }
    c_err = cuLaunchGrid(cudaKernel, nBlocksHor, nBlocksVer);
    if (_cudaErrorLaunchKernel(c_err, err, kernel.name)) {
        return false;
    }

    return true;
}
Exemple #6
0
/*
 * Initialization required before the kernel program can be used.
 *
 * Derives the launch geometry from N (thread_num is N capped at 16,
 * block_num is N divided by thread_num squared, rounded up), then brings up
 * the driver: cuInit, device 0, a new context, the module
 * "./cuda_main.cubin" and its "cuda_main" entry point. Finally it sets the
 * shared memory size and a thread_num x thread_num x 1 block shape.
 * Any failure prints a message and terminates the process with exit(1).
 */
void
init_cuda(void){

  /* launch geometry */
  if(N > 16)
    thread_num = 16;
  else
    thread_num = N;

  if(N % (thread_num*thread_num) == 0)
    block_num = N / (thread_num*thread_num);
  else
    block_num = N / (thread_num*thread_num) + 1;

  /* driver, device and context */
  res = cuInit(0);
  if(CUDA_SUCCESS != res){
    printf("cuInit failed: res = %s\n", conv(res));
    exit(1);
  }

  res = cuDeviceGet(&dev, 0);
  if(CUDA_SUCCESS != res){
    printf("cuDeviceGet failed: res = %s\n", conv(res));
    exit(1);
  }

  res = cuCtxCreate(&ctx, 0, dev);
  if(CUDA_SUCCESS != res){
    printf("cuCtxCreate failed: res = %s\n", conv(res));
    exit(1);
  }

  /* module and kernel entry point */
  res = cuModuleLoad(&module, "./cuda_main.cubin");
  if(CUDA_SUCCESS != res){
    printf("cuModuleLoad() failed: res = %s\n", conv(res));
    exit(1);
  }

  res = cuModuleGetFunction(&function, module, "cuda_main");
  if(CUDA_SUCCESS != res){
    printf("cuModuleGetFunction() failed: res = %s\n",  conv(res));
    exit(1);
  }

  /* preparation for the kernel launch */
  res = cuFuncSetSharedSize(function, 0x40);  /* just random */
  if(CUDA_SUCCESS != res){
    printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
    exit(1);
  }

  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  if(CUDA_SUCCESS != res){
    printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
    exit(1);
  }

}
/**
 * Launch `deviceFunction` with a variadic argument list.
 *
 * The first `parameterCountV` variadic arguments are read as GPUPtr values
 * and passed as pointer-sized kernel arguments; the remaining arguments, up
 * to `totalParameterCount`, are read as unsigned int. Each argument is
 * placed at an offset aligned for its type. The call runs with cudaContext
 * pushed as the current context and pops it again before returning. Only
 * grid.x and grid.y are used for the launch.
 */
void GPUInterface::LaunchKernel(GPUFunction deviceFunction,
                                Dim3Int block,
                                Dim3Int grid,
                                int parameterCountV,
                                int totalParameterCount,
                                ...) { // parameters
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernel\n");
#endif

    SAFE_CUDA(cuCtxPushCurrent(cudaContext));
    SAFE_CUDA(cuFuncSetBlockShape(deviceFunction, block.x, block.y, block.z));

    // Running byte offset into the kernel parameter buffer.
    int paramBytes = 0;

    va_list args;
    va_start(args, totalParameterCount);

    // Pointer arguments come first.
    for (int ptrIdx = 0; ptrIdx < parameterCountV; ++ptrIdx) {
        void* devicePtr = (void*)(size_t)va_arg(args, GPUPtr);

        // round the offset up to the alignment of a pointer
        const size_t ptrAlign = __alignof(devicePtr);
        paramBytes = (paramBytes + ptrAlign - 1) & ~(ptrAlign - 1);

        SAFE_CUDA(cuParamSetv(deviceFunction, paramBytes, &devicePtr, sizeof(devicePtr)));
        paramBytes += sizeof(void*);
    }

    // Then the scalar arguments.
    for (int intIdx = parameterCountV; intIdx < totalParameterCount; ++intIdx) {
        unsigned int scalar = va_arg(args, unsigned int);

        // round the offset up to the alignment of an unsigned int
        const size_t intAlign = __alignof(scalar);
        paramBytes = (paramBytes + intAlign - 1) & ~(intAlign - 1);

        SAFE_CUDA(cuParamSeti(deviceFunction, paramBytes, scalar));
        paramBytes += sizeof(scalar);
    }

    va_end(args);

    SAFE_CUDA(cuParamSetSize(deviceFunction, paramBytes));
    SAFE_CUDA(cuLaunchGrid(deviceFunction, grid.x, grid.y));
    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::LaunchKernel\n");
#endif

}
Exemple #8
0
/* OCaml stub: sets the block shape of the kernel referenced by `ker` to the
 * three integers stored in fields 0, 1 and 2 of `block`, then returns unit.
 * The driver call is made between CUDA_GET_CONTEXT and CUDA_RESTORE_CONTEXT. */
CAMLprim value spoc_cuda_set_block_shape(value ker, value block, value gi){
	CAMLparam3(ker, block, gi);
	CUfunction *kernel;
	int dim_x, dim_y, dim_z;

	CUDA_GET_CONTEXT;

	kernel = (CUfunction*) ker;
	dim_x = Int_val(Field(block,0));
	dim_y = Int_val(Field(block,1));
	dim_z = Int_val(Field(block,2));
	CUDA_CHECK_CALL(cuFuncSetBlockShape(*kernel, dim_x, dim_y, dim_z));

	CUDA_RESTORE_CONTEXT;
	CAMLreturn(Val_unit);
}
/**
 * Upload two host buffers, run the kernel `name` from `*phModule` on them
 * and download both buffers again.
 *
 * The kernel receives exactly two arguments: d_data0 and d_data1. The block
 * shape is thread_x by thread_y by thread_z and the grid is block_x by
 * block_y; block_z is accepted but not used by the launch.
 *
 * @return CUDA_SUCCESS, or the CUresult of the first step that failed. A
 *         message is printed for each failure and later steps are skipped.
 */
CUresult loadAndRunDualTestFunction(CUmodule *phModule, std::string name, CUdeviceptr &d_data0, 
				CUdeviceptr &d_data1, 
				DataStruct *h_data0, 
				DataStruct *h_data1, 
				unsigned int memSize, 
                                int thread_x=1,int thread_y=1,int thread_z=1,
                                int block_x=1, int block_y=1, int block_z=1)
{
  (void)block_z;  // the legacy launch call only takes a 2D grid

  // load the data to the device
  CUresult status = cuMemcpyHtoD(d_data0, h_data0, memSize);
  if (status == CUDA_SUCCESS)
    status = cuMemcpyHtoD(d_data1, h_data1, memSize);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during MemcpyHtoD\n"); return status;}

  // Locate the kernel entry point
  CUfunction phKernel = 0;
  status = cuModuleGetFunction(&phKernel, *phModule, name.c_str());
  if (status != CUDA_SUCCESS)
    {printf("ERROR: could not load function\n"); return status;}

  // Set the kernel parameters
  status = cuFuncSetBlockShape(phKernel, thread_x, thread_y, thread_z);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during setBlockShape\n"); return status;}

  int paramOffset = 0;
  const int size = sizeof(CUdeviceptr);

  status = cuParamSetv(phKernel, paramOffset, &d_data0, size);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during cuParamSetv\n"); return status;}
  paramOffset += size;

  status = cuParamSetv(phKernel, paramOffset, &d_data1, size);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during cuParamSetv\n"); return status;}
  paramOffset += size;

  status = cuParamSetSize(phKernel, paramOffset);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during cuParamSetSize\n"); return status;}

  // Launch the kernel
  status = cuLaunchGrid(phKernel, block_x, block_y);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during grid launch\n"); return status;}

  // Copy the result back to the host
  status = cuMemcpyDtoH(h_data0, d_data0, memSize);
  if (status == CUDA_SUCCESS)
    status = cuMemcpyDtoH(h_data1, d_data1, memSize);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during MemcpyDtoH\n");}

  return status;
}
Exemple #10
0
/**
 * Driver: configures drvfun and launches it on the default stream.
 *
 * Each thread block covers BLOCK_DIM1D elements, so the grid is
 * ceil(n / BLOCK_DIM1D) blocks wide and one block high. The dynamic shared
 * memory size is m floats. Kernel arguments are packed back to back: first
 * the `nrhs` entries of `prhs`, then the integers n and m. Any driver
 * failure is reported through mexErrMsgTxt.
 */
void hostGPUPdist(CUfunction drvfun, int nrhs, hostdrv_pars_t *prhs, int n, int m) {

  /* Number of blocks needed to cover n elements (rounded up). */
  const int numBlocks = (n + BLOCK_DIM1D - 1) / BLOCK_DIM1D; //BLOCK_DIM1D set in GPUkernel.hh

  CUresult status;

  // Execution configuration.
  status = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }

  status = cuFuncSetSharedSize(drvfun, m*sizeof(float));
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }

  // Running byte offset into the kernel parameter buffer.
  int paramBytes = 0;

  // Caller-supplied arguments, in order.
  int argIdx = 0;
  while (argIdx < nrhs) {
    status = cuParamSetv(drvfun, paramBytes, prhs[argIdx].par, prhs[argIdx].psize);
    if (status != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    paramBytes += prhs[argIdx].psize;
    ++argIdx;
  }

  // Trailing integer arguments: n, then m.
  const int trailingInts[2] = { n, m };
  for (int k = 0; k < 2; ++k) {
    status = cuParamSeti(drvfun, paramBytes, trailingInts[k]);
    if (status != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    paramBytes += sizeof(int);
  }

  status = cuParamSetSize(drvfun, paramBytes);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  status = cuLaunchGridAsync(drvfun, numBlocks, 1, 0);
  if (status != CUDA_SUCCESS) {
    mexErrMsgTxt("Error running kernel");
  }
}
Exemple #11
0
/* R binding for cuFuncSetBlockShape: takes the function handle from
 * r_hfunc and the first integer of each of r_x, r_y and r_z, calls the
 * driver and returns the resulting CUresult converted to an R value. */
SEXP R_auto_cuFuncSetBlockShape(SEXP r_hfunc, SEXP r_x, SEXP r_y, SEXP r_z)
{
    CUfunction kernel = (CUfunction) getRReference(r_hfunc);
    int dim_x = INTEGER(r_x)[0];
    int dim_y = INTEGER(r_y)[0];
    int dim_z = INTEGER(r_z)[0];
    CUresult status = cuFuncSetBlockShape(kernel, dim_x, dim_y, dim_z);

    return Renum_convert_CUresult(status);
}
Exemple #12
0
/**
 * Invokes the kernel @f on a @gridDimX x @gridDimY x @gridDimZ grid of blocks. 
 * Each block contains @blockDimX x @blockDimY x @blockDimZ threads.
 * @sharedMemBytes sets the amount of dynamic shared memory that will be 
 * available to each thread block.
 *
 * cuLaunchKernel() can optionally be associated to a stream by passing a 
 * non-zero hStream argument.
 *
 * Kernel parameters to @f can be specified in one of two ways:
 *
 * 1) Kernel parameters can be specified via kernelParams. If f has N 
 * parameters, then kernelParams needs to be an array of N pointers. Each of 
 * kernelParams[0] through kernelParams[N-1] must point to a region of memory 
 * from which the actual kernel parameter will be copied. The number of kernel 
 * parameters and their offsets and sizes do not need to be specified as that 
 * information is retrieved directly from the kernel's image.
 *
 * 2) Kernel parameters can also be packaged by the application into a single 
 * buffer that is passed in via the extra parameter. This places the burden on
 * the application of knowing each kernel parameter's size and alignment/
 * padding within the buffer. Here is an example of using the extra parameter 
 * in this manner:
 *
 *  size_t argBufferSize;
 *  char argBuffer[256];
 *
 *  // populate argBuffer and argBufferSize
 *
 *  void *config[] = {
 *      CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
 *      CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
 *      CU_LAUNCH_PARAM_END
 *  };
 *  status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 *
 * The extra parameter exists to allow cuLaunchKernel to take additional less 
 * commonly used arguments. extra specifies a list of names of extra settings 
 * and their corresponding values. Each extra setting name is immediately 
 * followed by the corresponding value. The list must be terminated with 
 * either NULL or CU_LAUNCH_PARAM_END.
 *
 *  CU_LAUNCH_PARAM_END, which indicates the end of the extra array;
 *  CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next value in 
 *  extra will be a pointer to a buffer containing all the kernel parameters 
 *  for launching kernel f;
 *  CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next value in extra
 *  will be a pointer to a size_t containing the size of the buffer specified 
 *  with CU_LAUNCH_PARAM_BUFFER_POINTER;
 *
 * The error CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters 
 * are specified with both kernelParams and extra (i.e. both kernelParams and 
 * extra are non-NULL).
 *
 * Calling cuLaunchKernel() sets persistent function state that is the same as 
 * function state set through the following deprecated APIs:
 *
 * cuFuncSetBlockShape() cuFuncSetSharedSize() cuParamSetSize() cuParamSeti() 
 * cuParamSetf() cuParamSetv()
 *
 * When the kernel @f is launched via cuLaunchKernel(), the previous block 
 * shape, shared size and parameter info associated with @f is overwritten.
 *
 * Note that to use cuLaunchKernel(), the kernel @f must either have been 
 * compiled with toolchain version 3.2 or later so that it will contain kernel 
 * parameter information, or have no kernel parameters. If either of these 
 * conditions is not met, then cuLaunchKernel() will return 
 * CUDA_ERROR_INVALID_IMAGE.
 *
 * Parameters:
 * f - Kernel to launch
 * gridDimX	- Width of grid in blocks
 * gridDimY - Height of grid in blocks
 * gridDimZ - Depth of grid in blocks
 * blockDimX - X dimension of each thread block
 * blockDimY - Y dimension of each thread block
 * blockDimZ - Z dimension of each thread block
 * sharedMemBytes - Dynamic shared-memory size per thread block in bytes
 * hStream - Stream identifier
 * kernelParams - Array of pointers to kernel parameters
 * extra - Extra options
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_HANDLE, 
 * CUDA_ERROR_INVALID_IMAGE, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_LAUNCH_FAILED,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, CUDA_ERROR_LAUNCH_TIMEOUT, 
 * CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, 
 * CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 
 */
CUresult cuLaunchKernel
(CUfunction f, 
 unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
 unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
 unsigned int sharedMemBytes, CUstream hStream, 
 void **kernelParams, void **extra)
{
	struct gdev_cuda_raw_func *rf;
	CUresult res;
	int i;

	/* This implementation is built on the legacy launch calls, so
	 * streams and the packed "extra" buffer are rejected up front. */
	if (hStream) {
		GDEV_PRINT("cuLaunchKernel: Stream is not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	if (extra) {
		GDEV_PRINT("cuLaunchKernel: Extra Parameters are not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	res = cuFuncSetSharedSize(f, sharedMemBytes);
	if (res != CUDA_SUCCESS)
		return res;

	res = cuFuncSetBlockShape(f, blockDimX, blockDimY, blockDimZ);
	if (res != CUDA_SUCCESS)
		return res;

	rf = &f->raw_func;

	/* A kernel that takes parameters needs the pointer array. */
	if (rf->param_count > 0 && !kernelParams)
		return CUDA_ERROR_INVALID_VALUE;

	/* Copy each parameter to the offset recorded in the kernel image. */
	for (i = 0; i < rf->param_count; i++) {
		void *p = kernelParams[i];
		int offset = rf->param_info[i].offset;
		uint32_t size = rf->param_info[i].size;
		if (!p)
			return CUDA_ERROR_INVALID_VALUE;
		res = cuParamSetv(f, offset, p, size);
		if (res != CUDA_SUCCESS)
			return res;
	}

	res = cuParamSetSize(f, rf->param_size);
	if (res != CUDA_SUCCESS)
		return res;

	/* NOTE: gridDimZ is not forwarded; cuLaunchGrid only takes X and Y. */
	res = cuLaunchGrid(f, gridDimX, gridDimY);
	if (res != CUDA_SUCCESS)
		return res;

	return CUDA_SUCCESS;
}
int main(int argc, char ** argv)
{
	int dev_count = 0;

	CUdevice   device;
	CUcontext  context;
	CUmodule   module;
	CUfunction function;

	cuInit(0);

	cuDeviceGetCount(&dev_count);

	if (dev_count < 1) return -1;

	cuDeviceGet( &device, 0 );
	cuCtxCreate( &context, 0, device );
	
	cuModuleLoad( &module, "hello.cuda_runtime.ptx" );
	cuModuleGetFunction( &function, module, "_Z6kernelPf" );

	int N = 512;
	CUdeviceptr pData;
	cuMemAlloc( &pData, N * sizeof(float) );
	cuFuncSetBlockShape( function, N, 1, 1 );
	cuParamSeti( function, 0, pData );
	cuParamSetSize( function, 4 );

	cuLaunchGrid( function, 1, 1 );

	float * pHostData = new float[N];

	cuMemcpyDtoH( pHostData, pData, N * sizeof( float) );

	cuMemFree( pData );

	delete [] pHostData;

	return 0;
}
Exemple #14
0
/*
 * Adds two n x n float matrices on the GPU and verifies the result.
 *
 * Loads "<path>/fmadd_gpu.cubin", runs the kernel "_Z3addPfS_S_i" with
 * arguments (a, b, c, n), compares every element of c with a + b on the
 * host and prints the transfer, execution and total times in milliseconds.
 *
 * Returns 0 on success and -1 on any failure (invalid n, allocation,
 * CUDA error or result mismatch). Host buffers are released on every path;
 * device buffers and the context are released on failure paths as well.
 */
int cuda_test_fmadd(unsigned int n, char *path)
{
	unsigned int i, j, idx;
	int ret = -1;
	int len;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev = 0, b_dev = 0, c_dev = 0;
	float *a = NULL;
	float *b = NULL;
	float *c = NULL;
	int block_x, block_y, grid_x, grid_y;
	int offset;
	char fname[256];
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	float exec;

	/* n == 0 would make the block size 0 and divide by zero below. */
	if (n == 0) {
		printf("n must be greater than 0\n");
		return -1;
	}

	a = (float *) malloc (n*n * sizeof(float));
	b = (float *) malloc (n*n * sizeof(float));
	c = (float *) malloc (n*n * sizeof(float));
	if (a == NULL || b == NULL || c == NULL) {
		printf("malloc failed\n");
		goto out_host;
	}

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		for(j = 0; j < n; j++) {
			idx = i * n + j;
			a[idx] = i + 0.1;
			b[idx] = i + 0.1;
		}
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;
	printf("block = (%d, %d)\n", block_x, block_y);
	printf("grid = (%d, %d)\n", grid_x, grid_y);

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		goto out_host;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		goto out_host;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		goto out_host;
	}

	/* Bounded formatting: a long path must not overflow fname. */
	len = snprintf(fname, sizeof(fname), "%s/fmadd_gpu.cubin", path);
	if (len < 0 || (size_t)len >= sizeof(fname)) {
		printf("module path is too long\n");
		goto out_ctx;
	}
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		goto out_ctx;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		goto out_ctx;
	}
	res = cuFuncSetSharedSize(function, 0x40); /* just random */
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetSharedSize() failed\n");
		goto out_ctx;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		goto out_ctx;
	}

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		a_dev = 0;
		goto out_ctx;
	}
	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		b_dev = 0;
		goto out_ctx;
	}
	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		c_dev = 0;
		goto out_ctx;
	}

	gettimeofday(&tv_h2d_start, NULL);
	/* upload a[] and b[] */
	res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	gettimeofday(&tv_h2d_end, NULL);

	/* set kernel parameters: (a, b, c, n) packed back to back */
	offset = 0;
	res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev));	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (a) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	offset += sizeof(a_dev);
	res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (b) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	offset += sizeof(b_dev);
	res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (c) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	offset += sizeof(c_dev);
	res = cuParamSetv(function, offset, &n, sizeof(n));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (n) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	offset += sizeof(n);
	res = cuParamSetSize(function, offset);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */
	res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	gettimeofday(&tv_d2h_end, NULL);

	/* Regular teardown. Each handle is cleared once its release has been
	 * attempted so the failure path below never releases it twice. */
	res = cuMemFree(a_dev);
	a_dev = 0;
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	res = cuMemFree(b_dev);
	b_dev = 0;
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}
	res = cuMemFree(c_dev);
	c_dev = 0;
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		goto out_ctx;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		goto out_host;
	}

	gettimeofday(&tv_total_end, NULL);

	/* check the results: every row, every column */
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			idx = i * n + j;
			if (c[idx] != a[idx] + b[idx]) {
				printf("c[%u] = %f\n", idx, c[idx]);
				printf("a[%u]+b[%u] = %f\n", idx, idx, a[idx]+b[idx]);
				goto out_host;
			}
		}
	}

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);
	printf("Exec: %f\n", exec);
	printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec);
	printf("Total: %f\n", total);

	ret = 0;
	goto out_host;

out_ctx:
	/* Failure after the context was created: best-effort release. */
	if (a_dev)
		cuMemFree(a_dev);
	if (b_dev)
		cuMemFree(b_dev);
	if (c_dev)
		cuMemFree(c_dev);
	cuCtxDestroy(ctx);

out_host:
	free(a);
	free(b);
	free(c);

	return ret;
}
Exemple #15
0
/*
 * Look up the kernel called `kernel`, bind its arguments and launch it
 * asynchronously.
 *
 *   kernel - name of the kernel to run
 *   grid   - grid dimensions; grid.z must be 1
 *   block  - block dimensions
 *   shmem  - shared memory size passed to cuFuncSetSharedSize
 *   flags  - not referenced in this function
 *   ptrs   - one pointer per argument, pointing at the argument's value
 *   types  - SWAN_* type tag per argument, terminated by SWAN_END
 *
 * The launch is silently skipped when any grid or block dimension is 0.
 * Kernel handles are cached by name in `state` after the first lookup.
 *
 * NOTE(review): DEBLOCK is defined elsewhere. It presumably expands to a
 * `case` that copies one argument into the parameter buffer using the
 * locals `f`, `offset` and `ptr` - confirm before renaming any of them.
 */
void swanRunKernelAsync( const char *kernel,  block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types  ) {
	// find the kernel

	if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; } // suppress launch of kernel if any of the launch dims are 0

	CUfunction f = NULL;
	int i;
	int offset = 0;
	CUresult err;

	int type;
	int idx=0;
	try_init();
	// first look in the cache of kernels that were already resolved
	for( i=0; i < state.num_funcs; i++ ) {
		if( !strcmp( state.func_names[i], kernel ) ) {
			f = state.funcs[i];
			break;
		}
	}

	// not cached: search every loaded module for the kernel
	if( f == NULL ) {
		for( i=0; i < state.num_mods; i++ ) {
			cuModuleGetFunction( &f, state.mods[i], kernel );
			if( f!= NULL ) { 
				// found a kernel. store it for future use
				int j = state.num_funcs;
				state.num_funcs++;
				state.funcs      = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * state.num_funcs );
				state.funcs[j]   = f;
				state.func_names = (char**)      realloc( state.func_names, sizeof(char*) * state.num_funcs );
				state.func_names[j] = (char*) malloc( strlen(kernel) + 1 );
				strcpy( state.func_names[j], kernel );
				break; 
			}
		}
	}

	if( f== NULL ) {
		fprintf(stderr, "Error running kernel [%s] : \n", kernel );
		error( "No kernel found" );
	}

	// only 2D grids can be launched (cuLaunchGridAsync takes x and y)
	if( grid.z != 1 ) {
		printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z );
		error( "grid.z needs to be 1" );
	}

//printf("Running kernel [%s]\n", kernel );

	// walk the type tags until SWAN_END, binding one argument per tag
	type = types[idx];
	while( type != SWAN_END ) {
		void *ptr = ptrs[idx];
		switch( type ) {
//			DEBLOCK( SWAN_uchar, uchar,  1 );
			DEBLOCK( SWAN_uchar2, uchar2,  2 );
			DEBLOCK( SWAN_uchar3, uchar3,  1 );
			DEBLOCK( SWAN_uchar4, uchar4,  4 );
			DEBLOCK( SWAN_char , int,  1 );
//			DEBLOCK( SWAN_char1 , char1,  1 );
			DEBLOCK( SWAN_char2 , char2,  2 );
			DEBLOCK( SWAN_char3 , char3,  1 );
			DEBLOCK( SWAN_char4 , char4,  4 );
			DEBLOCK( SWAN_int, int,  4 );
//			DEBLOCK( SWAN_int1, int1,  4 );
			DEBLOCK( SWAN_int2, int2,  8 );
			DEBLOCK( SWAN_int3, int3,  4 );
			DEBLOCK( SWAN_int4, int4,  16 );
//			DEBLOCK( SWAN_float, double,  4 );
//			DEBLOCK( SWAN_float1, float1,  4 );
			DEBLOCK( SWAN_float2, float2,  8 );
			DEBLOCK( SWAN_float3, float3,  4 );
			DEBLOCK( SWAN_float4, float4,  16 );

			DEBLOCK( SWAN_uint, uint,  4 );
			DEBLOCK( SWAN_uint2, uint2,  8 );
			DEBLOCK( SWAN_uint3, uint3,  4 );
			DEBLOCK( SWAN_uint4, uint4,  16 );
			DEBLOCK( SWAN_float, float,  4 );


//#define DEBLOCK(swan_type,type,OFFSET) 
			// Pointer arguments: sizeof(void*) bytes from CUDA 3.2 on,
			// sizeof(int) bytes for older toolkits.
#if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4
			case SWAN_PTR: 
				{
//printf("PTR as NATIVE\n");
				ALIGN_UP( offset, (sizeof(void*)));
				cuParamSetv( f, offset, ptr, sizeof(void*) );
				offset += sizeof(void*); }
			break;
#else
			case SWAN_PTR: 
				{
//printf("PTR as INT\n");
				ALIGN_UP( offset, (sizeof(int)));
				cuParamSetv( f, offset, ptr, sizeof(int) );
				offset += sizeof(int); }
			break;
#endif



			default:
        printf("%d\n", type );
				error("Parameter type not handled\n");


		}
		idx++;
		type = types[idx];
	}

//printf("Launching kernel [%s] [%X]  with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z );
//printf(" TOTAL OFFSET %d\n", offset );
	// offset now holds the total size of the parameter buffer
	CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) );
#if (CUDA_MAJOR ==3 && CUDA_MINOR >=1 ) || CUDA_MAJOR>=4
	cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd
#endif

	// launched with a NULL stream argument rather than state.stream
	err = cuLaunchGridAsync( f, grid.x, grid.y, NULL ) ; //state.stream ) ;

	if( err != CUDA_SUCCESS ) {
		fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z );
	assert(0);
		exit(-99);
	}

//printf("Kernel completed\n" );
}
Exemple #16
0
	void Function::setBlockShape(int x, int y, int z) const
	{
		detail::error_check(cuFuncSetBlockShape(impl->func, x, y, z),
			"Can't set Cuda function block shape");
	}
Exemple #17
0
/* Returns (end - start) in milliseconds, computed with the file's tvsub() helper. */
static float madd_vmmap_elapsed_ms(struct timeval *end, struct timeval *start)
{
	struct timeval tv;

	tvsub(end, start, &tv);
	return tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
}

/* Allocates `bytes` of device memory, maps it for host access and queries its
 * physical address. Prints which call failed and returns -1 on error, 0 on success. */
static int madd_vmmap_alloc_mapped(CUdeviceptr *dev, unsigned int **buf,
				   size_t bytes, const char *label)
{
	unsigned long long int phys;
	CUresult res;

	res = cuMemAlloc(dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (%s) failed\n", label);
		return -1;
	}
	res = cuMemMap((void**)buf, *dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (%s) failed\n", label);
		return -1;
	}
	res = cuMemGetPhysAddr(&phys, (void*)*buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (%s) failed\n", label);
		return -1;
	}
	/*printf("%s[]: Physical Address 0x%llx\n", label, phys);*/
	return 0;
}

/* Writes a device pointer into the kernel parameter block as two 32-bit words
 * (low word at `offset`, high word at `offset + 4`). Returns 0 on success, -1 on error. */
static int madd_vmmap_set_ptr_param(CUfunction function, int offset,
				    CUdeviceptr dev, const char *label)
{
	CUresult res;

	res = cuParamSeti(function, offset, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (%s) failed: res = %lu\n", label, (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, offset + 4, dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (%s) failed: res = %lu\n", label, (unsigned long)res);
		return -1;
	}
	return 0;
}

/* Unmaps the host view of a buffer and frees its device allocation.
 * Returns 0 on success, -1 on error. */
static int madd_vmmap_release_mapped(CUdeviceptr dev, unsigned int *buf,
				     const char *label)
{
	CUresult res;

	res = cuMemUnmap((void*)buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (%s) failed: res = %lu\n", label, (unsigned long)res);
		return -1;
	}
	res = cuMemFree(dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (%s) failed: res = %lu\n", label, (unsigned long)res);
		return -1;
	}
	return 0;
}

/*
 * Runs the n x n matrix-add kernel "_Z3addPjS_S_j" from <path>/madd_gpu.cubin
 * using host-mapped device buffers (cuMemMap) instead of explicit HtoD/DtoH
 * copies, and prints a per-phase timing breakdown in milliseconds.
 *
 * n:    matrix dimension; must be non-zero.
 * path: directory that contains madd_gpu.cubin.
 *
 * Returns 0 on success and -1 on any failure (a message naming the failing
 * call is printed). As in the original code, GPU-side resources are not
 * unwound on the failure path; only the host result buffer is released.
 */
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path)
{
	unsigned int i, j;
	size_t idx;
	size_t bytes;
	int len;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	unsigned int *a_buf, *b_buf, *c_buf;
	unsigned int *c;
	int block_x, block_y, grid_x, grid_y;
	char fname[256];
	int ret = -1;
	struct timeval tv_total_start, tv_total_end;
	struct timeval tv_h2d_start, tv_h2d_end;
	struct timeval tv_d2h_start, tv_d2h_end;
	struct timeval tv_exec_start, tv_exec_end;
	struct timeval tv_mem_alloc_start;
	struct timeval tv_data_init_start;
	struct timeval tv_conf_kern_start;
	struct timeval tv_close_start;
	unsigned int dummy_c = 0;

	/* n == 0 would make block_x zero and divide by zero below. */
	if (n == 0) {
		printf("matrix dimension must be non-zero\n");
		return -1;
	}

	/* Widen before multiplying so n*n cannot wrap in 32-bit arithmetic. */
	bytes = (size_t)n * n * sizeof(unsigned int);

	c = (unsigned int *) malloc(bytes);
	if (c == NULL) {
		printf("malloc (c) failed\n");
		return -1;
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		goto out;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		goto out;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		goto out;
	}

	/* Bounded formatting: a long `path` must not overflow fname. */
	len = snprintf(fname, sizeof(fname), "%s/madd_gpu.cubin", path);
	if (len < 0 || (size_t)len >= sizeof(fname)) {
		printf("module path too long\n");
		goto out;
	}
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		goto out;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		goto out;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		goto out;
	}

	gettimeofday(&tv_mem_alloc_start, NULL);

	/* a[], b[], c[]: allocate on the device and map into the host address space */
	if (madd_vmmap_alloc_mapped(&a_dev, &a_buf, bytes, "a") != 0)
		goto out;
	if (madd_vmmap_alloc_mapped(&b_dev, &b_buf, bytes, "b") != 0)
		goto out;
	if (madd_vmmap_alloc_mapped(&c_dev, &c_buf, bytes, "c") != 0)
		goto out;

	gettimeofday(&tv_data_init_start, NULL);

	/* initialize A[] & B[]: every element of row i holds the value i */
	for (i = 0; i < n; i++) {
		idx = (size_t)i * n;
		for (j = 0; j < n; j++)
			a_buf[idx++] = i;
	}
	for (i = 0; i < n; i++) {
		idx = (size_t)i * n;
		for (j = 0; j < n; j++)
			b_buf[idx++] = i;
	}

	/* No explicit host-to-device copy happens in this variant; the pair of
	 * timestamps is kept so the report has the same fields as other tests. */
	gettimeofday(&tv_h2d_start, NULL);
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_conf_kern_start, NULL);

	/* set kernel parameters: three 64-bit pointers at 0/8/16, then n at 24 */
	if (madd_vmmap_set_ptr_param(function, 0, a_dev, "a") != 0)
		goto out;
	if (madd_vmmap_set_ptr_param(function, 8, b_dev, "b") != 0)
		goto out;
	if (madd_vmmap_set_ptr_param(function, 16, c_dev, "c") != 0)
		goto out;
	res = cuParamSeti(function, 24, n);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (n) failed: res = %lu\n", (unsigned long)res);
		goto out;
	}
	res = cuParamSetSize(function, 28);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		goto out;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		goto out;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] through the host mapping */
	memcpy(c, c_buf, bytes);
	gettimeofday(&tv_d2h_end, NULL);

	/* Read back: touch every result element so the access cost is timed */
	for (i = 0; i < n; i++) {
		idx = (size_t)i * n;
		for (j = 0; j < n; j++)
			dummy_c = c[idx++];
	}
	(void)dummy_c;

	gettimeofday(&tv_close_start, NULL);

	if (madd_vmmap_release_mapped(a_dev, a_buf, "a") != 0)
		goto out;
	if (madd_vmmap_release_mapped(b_dev, b_buf, "b") != 0)
		goto out;
	if (madd_vmmap_release_mapped(c_dev, c_buf, "c") != 0)
		goto out;

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		goto out;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		goto out;
	}

	gettimeofday(&tv_total_end, NULL);

	printf("Init: %f\n", madd_vmmap_elapsed_ms(&tv_mem_alloc_start, &tv_total_start));
	printf("MemAlloc: %f\n", madd_vmmap_elapsed_ms(&tv_data_init_start, &tv_mem_alloc_start));
	printf("DataInit: %f\n", madd_vmmap_elapsed_ms(&tv_h2d_start, &tv_data_init_start));
	printf("HtoD: %f\n", madd_vmmap_elapsed_ms(&tv_h2d_end, &tv_h2d_start));
	printf("KernConf: %f\n", madd_vmmap_elapsed_ms(&tv_exec_start, &tv_conf_kern_start));
	printf("Exec: %f\n", madd_vmmap_elapsed_ms(&tv_exec_end, &tv_exec_start));
	printf("DtoH: %f\n", madd_vmmap_elapsed_ms(&tv_d2h_end, &tv_d2h_start));
	printf("DataRead: %f\n", madd_vmmap_elapsed_ms(&tv_close_start, &tv_d2h_end));
	printf("Close: %f\n", madd_vmmap_elapsed_ms(&tv_total_end, &tv_close_start));
	printf("Total: %f\n", madd_vmmap_elapsed_ms(&tv_total_end, &tv_total_start));

	ret = 0;

out:
	/* The host result buffer was never released in the original code. */
	free(c);
	return ret;
}
Exemple #18
0
//host driver
//void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) {
//
// Launches `drvfun` over N elements, split into consecutive chunks of at most
// maxthreads * blocksdim1d elements. Each chunk is launched as a 1-D grid of
// 256-thread blocks on stream 0 via cuLaunchGridAsync.
//
// Kernel parameter layout, in order:
//   prhs[0..nrhs-1]  caller-supplied arguments (each with its own align/psize)
//   size             number of elements in this chunk
//   offset           index of the first element of this chunk
//   imx, imy, outx, outy, poolx, pooly   forwarded unchanged
//
// Any driver failure is reported through mexErrMsgTxt.
void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){
    //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z);

    unsigned int maxthreads = 65000;
    // Set threads per block here.
    unsigned int blocksdim1d = 256;
    dim3 threads(blocksdim1d, 1, 1);
    int nstreams = iDivUp(N, maxthreads*blocksdim1d);
    CUresult err = CUDA_SUCCESS;
    for (int str = 0; str < nstreams; str++) {
        int offset = str * maxthreads * blocksdim1d;
        int size = 0;
        if (str == (nstreams - 1))
            size = N - str * maxthreads * blocksdim1d;   // last chunk takes the remainder
        else
            size = maxthreads * blocksdim1d;

        int gridx = iDivUp(size, blocksdim1d); // number of x blocks

        // setup execution parameters
        // (the z dimension previously passed threads.y by mistake)
        if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) {
            mexErrMsgTxt("Error in cuFuncSetBlockShape");
        }

        if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) {
            mexErrMsgTxt("Error in cuFuncSetSharedSize");
        }

        //mexPrintf("block shape ok\n");

        // add parameters
        int poffset = 0;

        // CUDA kernels interface: caller-supplied arguments first
        for (int p=0;p<nrhs;p++) {
            ALIGN_UP(poffset, prhs[p].align);
            if (CUDA_SUCCESS
                    != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
                mexErrMsgTxt("Error in cuParamSetv");
            }
            poffset += prhs[p].psize;
        }

        // Trailing int arguments, appended in the order the kernel expects.
        const int scalars[] = { size, offset, imx, imy, outx, outy, poolx, pooly };
        const unsigned int nscalars = sizeof(scalars) / sizeof(scalars[0]);
        for (unsigned int s = 0; s < nscalars; s++) {
            ALIGN_UP(poffset, __alignof(int));
            if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, scalars[s])) {
                mexErrMsgTxt("Error in cuParamSeti");
            }
            poffset += sizeof(int);
        }

//   if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) {
//     mexErrMsgTxt("Error in cuParamSeti");
//   }
//   poffset += sizeof(shmemX);

        if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
            mexErrMsgTxt("Error in cuParamSetSize");
        }

        err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
        if (CUDA_SUCCESS != err) {
            mexErrMsgTxt("Error running kernel");
        }

    }
}
/*
 * Entry point of the bitonic-sort driver.
 *
 * Options (getopt string "dip:vO"):
 *   -d        set GROSS_DEBUG in `verbose`
 *   -i        read the input integers from stdin
 *   -O        set OUTPUT in `verbose`
 *   -p <exp>  sort 1 << exp random integers
 *   -v        set DEBUG in `verbose`
 * Without -i or -p, the first non-option argument is taken as the element count.
 *
 * Phase 1 launches BITONIC_BLOCK_FN with one thread per element; phase 2
 * launches MERGE_BLOCKS_FN repeatedly, alternating the two device arrays as
 * source and destination.
 *
 * NOTE(review): the definition is cut off in this file after the merge loop;
 * result download, timing report and cleanup are not visible here.
 */
int main( int argc, char** argv)
{
	uint num_threads;
	uint num_blocks, block_size;
	uint length;
	uint nBytes;
	int *list;
	int status, verbose, c, i, j, logBlocks;
	int read_stdin;
	struct timeval start_time, end_time;
	unsigned long total_time;
	CUdevice hDevice;
	CUcontext hContext;
	CUmodule hModule;
	CUfunction bitonicBlockFn;
	CUfunction mergeBlocksFn;
	CUdeviceptr pDeviceArrayA;
	CUdeviceptr pDeviceArrayB;

	status = SUCCESS;
	verbose = 0;
	read_stdin = FALSE;
	length = 0;

	while ((c = getopt (argc, argv, "dip:vO")) != -1) {
		switch (c) {
		case 'd':
			verbose |= GROSS_DEBUG;
			break;
		case 'i':
			read_stdin = TRUE;
			/* NOTE(review): no break here, so -i also sets OUTPUT via the
			 * 'O' case below - confirm this fall-through is intentional. */
		case 'O':
			verbose |= OUTPUT;
			break;
		case 'p':
			/* -p gives the size as a power of two */
			length = 1 << atoi(optarg);
			break;
		case 'v':
			verbose |= DEBUG;
			break;
		case '?':
		default:
			print_usage();
			return FAILURE;
		}
	}

	if ( read_stdin == TRUE ) {
		/* Read sequence of integers from stdin */
		/* NOTE(review): the buffer holds INIT_INPUT_SIZE ints; whether
		 * readIntegers can grow it is not visible here - verify. */
		list = (int*) malloc (INIT_INPUT_SIZE * sizeof(int) );
		length = readIntegers(list, INIT_INPUT_SIZE);
	} else if ( length > 0 ) {
		/* Size came from -p: fill with random integers */
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	} else if (optind >= argc) { /* No size was given */
		print_usage();
		return FAILURE;
	} else {
		/* Generate our own integers */
		length = atoi(argv[optind]);
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	}

	/*
	* Phase 1:
	* 	There will be one thread for each element to be sorted. Each
	*	block will perform bitonic sort on MAX_THREADS_PER_BLOCK elements.
	*/

	/* Initialize sizes */
	num_threads = _min(length, MAX_THREADS_PER_BLOCK );
	/* Ceiling division: number of blocks needed to cover `length` elements */
	num_blocks = (length-1) / MAX_THREADS_PER_BLOCK + 1;
	nBytes = length * sizeof(int);

	if (verbose & DEBUG) printf("Initializing GPU.\n");
	
	/* Start timing */
	gettimeofday(&start_time, NULL);

	/* Initialize GPU */
	cutilDrvSafeCall( cuInit(0) 					);
	cutilDrvSafeCall( cuDeviceGet(&hDevice, 0)			); 
	cutilDrvSafeCall( cuCtxCreate(&hContext, 0, hDevice) 		);
	cutilDrvSafeCall( cuModuleLoad(&hModule, MODULE_FILE) 		);
	cutilDrvSafeCall( cuModuleGetFunction(&bitonicBlockFn, hModule, BITONIC_BLOCK_FN) );

	/* Allocate memory on the device */
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayA, nBytes)		);
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayB, nBytes)		);
	cutilDrvSafeCall( cuMemcpyHtoD(pDeviceArrayA, list, nBytes) 	);
	cutilDrvSafeCall( cuFuncSetBlockShape(bitonicBlockFn, num_threads, 1, 1));
	/* The only kernel argument is the device array, passed as a 4-byte value.
	 * NOTE(review): this assumes device pointers fit in 32 bits - confirm. */
	cutilDrvSafeCall( cuParamSeti(bitonicBlockFn, 0, pDeviceArrayA)	);
	cutilDrvSafeCall( cuParamSetSize(bitonicBlockFn, 4)		);
	
	/* Execute the kernel on the GPU */
	if ( verbose & DEBUG ) printf("Launching bitonic sort kernel with %d blocks and %d threads per block.\n", num_blocks, num_threads);
	cutilDrvSafeCall( cuLaunchGrid(bitonicBlockFn, num_blocks, 1)		);

	/*
	* Phase 2:
	* 	At this point each block is a sorted list. Now it's time to merge them.	
	*/

	/* TODO This should go away after development */
	if ( verbose & GROSS_DEBUG ) {
		/* Dump the per-block results of phase 1 */
		cuMemcpyDtoH(list, pDeviceArrayA, nBytes);
		for (i=0; i<num_blocks; ++i) {
			printf("### Block %d:\n", i);
			for (j=0; j<num_threads; ++j) {
				printf("%d\n", list[i*num_threads + j]);
			}
		}
	}
	
	i=0;

	/* Do we need to merge blocks? */
	if ( num_blocks > 1 ) {

		/* There will be Log_2(num_blocks) merge steps. */
		logBlocks = 0;
		for (i=1; i<num_blocks; i *= 2)	++logBlocks;

		if ( verbose & DEBUG ) printf("There will be %d merge steps.\n", logBlocks);	

		block_size = num_threads; 	/* How big the blocks were in the last grid launch. */
		num_threads = num_blocks >> 1;  /* Start with blocks/2 threads */
		num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK  +  1;

		/* Merge kernel parameter layout: 0 = merge level, 4 = block size,
		 * 8 = source array, 12 = destination array (16 bytes total). */
		cutilDrvSafeCall( cuModuleGetFunction(&mergeBlocksFn, hModule, MERGE_BLOCKS_FN) );
		cuParamSeti(mergeBlocksFn, 4, block_size);
		cuParamSetSize(mergeBlocksFn, 16);

		for (i=0; i < logBlocks; ++i) {
			/* NOTE(review): the block shape uses the total num_threads rather
			 * than a per-block count - verify against MAX_THREADS_PER_BLOCK. */
			cuFuncSetBlockShape(mergeBlocksFn, num_threads, 1, 1);
			cuParamSeti(mergeBlocksFn, 0, i); /* set merge level */

			/* Merging uses a source array and destination array, the gpu has 2 arrays allocated
			 * so we swap which is the source and which is the destination for each iteration. */
			if ( i%2 == 0 ) {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayA);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayB);
			} else {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayB);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayA);
			}

			if ( verbose & DEBUG ) {
				printf("Launching block merge kernel with %d blocks and %d threads per block\n", 
									num_blocks, num_threads/num_blocks);
			}	
			cutilDrvSafeCall( cuLaunchGrid(mergeBlocksFn, num_blocks, 1) );

			/* Each merge step halves the number of threads */
			num_threads = num_threads >> 1;
			num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK  +  1;
		}
	}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//!
//! Loads "pass_kernel" from "pass_kernel.cubin" through initCuda(), passes it
//! NUM_ARG integers (0 .. NUM_ARG-1) followed by two device pointers (an int
//! array of NUM_ARG elements and a single int), launches one block of NUM_ARG
//! threads on a 1x1 grid, then prints the values the kernel wrote back.
//!
//! @param argc  command-line argument count, forwarded to initCuda()
//! @param argv  command-line arguments, forwarded to initCuda()
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char** argv)
{
    CUcontext cuContext;

    // initialize CUDA
    CUfunction pk = NULL;
    const char cubin_name [] = "pass_kernel.cubin";
    const char kernel_name [] = "pass_kernel";

    CU_SAFE_CALL(initCuda(cuContext, argv[0], &pk, argc, argv, cubin_name, kernel_name));
    printf("initCuda-returned CUfunction:\n");

    // cuParamSetx, x=i f v
    // http://visionexperts.blogspot.com/2010/07/cuda-parameter-alignment.html - check alignment
    #define ALIGN_UP(offset, alignment)					\
        (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    // The driver API takes parameter offsets as int; keeping the running
    // offset an int also makes the "%d" conversions below well-defined.
    int offset = 0;

    // input integers
    // CU paramset i.
    for(int i = 0 ; i < NUM_ARG ; i++)
    {
        int align = __alignof(int);
        ALIGN_UP(offset, align);
        cutilDrvSafeCallNoSync(cuParamSeti(pk, offset, i));
        printf ("offset %d = %d\n", i, offset);
        offset += sizeof(int);
    }

    // return array for updated inputs
    int size_int = sizeof(int);

    int size_array = size_int * NUM_ARG;
    CUdeviceptr d_return_values;
    CU_SAFE_CALL(cuMemAlloc (&d_return_values, size_array));
    void* ptr = (void*)(size_t)d_return_values;
    int align = __alignof(ptr);
    ALIGN_UP(offset, align);
    cutilDrvSafeCallNoSync(cuParamSetv(pk, offset, &ptr, sizeof(ptr)));
    printf("return values offset:%d\n", offset);
    offset += sizeof(ptr);

    // single int the kernel writes back
    CUdeviceptr d_return_N;
    CU_SAFE_CALL(cuMemAlloc(&d_return_N, size_int));
    void* ptrN = (void*)(size_t)d_return_N;
    int alignN = __alignof(ptrN);
    ALIGN_UP(offset, alignN);
    cutilDrvSafeCallNoSync(cuParamSetv(pk, offset, &ptrN, sizeof(ptrN)));
    printf("return int offset:%d\n", offset);
    offset += sizeof(ptrN);

    // Calling kernel
    int BLOCK_SIZE_X = NUM_ARG;
    int BLOCK_SIZE_Y = 1;
    int BLOCK_SIZE_Z = 1;
    int GRID_SIZE = 1;
    cutilDrvSafeCallNoSync(cuFuncSetBlockShape(pk, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z));

    printf("paramsetsize:%d\n", offset);
    CU_SAFE_CALL(cuParamSetSize(pk, offset));
    CU_SAFE_CALL(cuLaunchGrid(pk, GRID_SIZE, GRID_SIZE));

    int* h_return_values = (int*)malloc(NUM_ARG * sizeof(int));
    CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_values, d_return_values, size_array));
    CU_SAFE_CALL(cuMemFree(d_return_values));

    for(int i=0;i<NUM_ARG;i++)
        printf("%dth value = %d\n", i, h_return_values[i]);
    free(h_return_values);

    int* h_return_N = (int*)malloc(sizeof(int));
    CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_N, d_return_N, size_int));
    CU_SAFE_CALL(cuMemFree(d_return_N));

    printf("%d sizeof array\n", *h_return_N);
    free(h_return_N);   // was leaked before

    if(cuContext !=NULL) cuCtxDetach(cuContext);
}
Exemple #21
0
int main(int argc, char *argv[])
{
	srand(time(NULL));
	for(int k=0;k<4;k++)
	{
		int n = 30*(k+1);
		float x = ((float) rand()) / (float) RAND_MAX;
		float *a = new float[n+1];
		float resultGPU;

		for(int i = 0; i < n + 1; i++)
			a[i] = i * 0.5*((float) rand()) / (float) RAND_MAX;


		int blocks = (n + 1) / BLK_SZ;
		if((n + 1) % BLK_SZ)
			blocks++;

		CUdevice	hDevice;
		CUcontext	hContext;
		CUmodule	hModule;
		CUfunction	hFunction;

		CALL( cuInit(0) );
		CALL( cuDeviceGet(&hDevice, 0) ); 	
		CALL( cuCtxCreate(&hContext, 0, hDevice) );
		CALL( cuModuleLoad(&hModule, "kernel.cubin") );
		CALL( cuModuleGetFunction(&hFunction, hModule, "Polynomial") );


		//dane wejsciowe - kopiowanie
		CUdeviceptr DevA, DevResult;

		CALL( cuMemAlloc(&DevA, (n+1)*sizeof(float) ) );
		CALL( cuMemAlloc(&DevResult, sizeof(float) ) );

		CALL( cuMemcpyHtoD(DevA, a, (n+1)*sizeof(float)  ) );


		CALL( cuFuncSetBlockShape(hFunction, BLK_SZ, 1, 1) );


		//przekazanie parametrow do kernela
		int 	offset = 0;
		void   *ptr;

		ptr = (void*)(size_t)DevResult;
		ALIGN_UP(offset, __alignof(ptr));
		CALL( cuParamSetv(hFunction, offset, &ptr, sizeof(ptr)) );
		offset += sizeof(ptr);

		ptr = (void*)(size_t)DevA;
		ALIGN_UP(offset, __alignof(ptr));
		CALL( cuParamSetv(hFunction, offset, &ptr, sizeof(ptr)) );
		offset += sizeof(ptr);

		ALIGN_UP(offset, __alignof(float));
		CALL( cuParamSetf(hFunction, offset, x) );
		offset += sizeof(float);

		ALIGN_UP(offset, __alignof(int));
		CALL( cuParamSeti(hFunction, offset, n) );
		offset += sizeof(int);


		CALL( cuParamSetSize(hFunction, offset) );

		CALL( cuLaunchGrid(hFunction, blocks, 1) );


		//kopiowanie wyniku na hosta
		CALL( cuMemcpyDtoH((void *) &resultGPU, DevResult, sizeof(float) ) );


		//zwalnianie pamieci na urzadzeniu
		CALL( cuMemFree(DevA) );
		CALL( cuMemFree(DevResult) );


		//obliczenia na CPU
		float resultCPU = PolynomialCPU(a, x, n);


		std::cout << "GPU:\t" << resultGPU << std::endl;
		std::cout << "CPU:\t" << resultCPU << std::endl;
		std::cout << "roznica:\t" << fabs(resultGPU - resultCPU) << std::endl;
		
		delete [] a;
	}

	return 0;
}
Exemple #22
0
// Looks up kernel `test_name` in `cuModule`, runs it with a single thread and
// reports the outcome on std::cout.
//
// The kernel receives two device pointers: a result array of `max` bools
// (zero-initialised) and an int counter. After the launch the counter is read
// back as h_N and the first h_N result entries are inspected; any entry equal
// to 0 marks that sub-test as failed.
//
// Every driver call is routed through test(); an exception escaping the body
// is reported as "<test_name> crashed.".
void load_and_test(CUmodule cuModule, char * test_name)
{
	try
	{
		CUfunction proc;
		test(cuModuleGetFunction(&proc, cuModule, test_name), "cuModuleGetFunction");

		int max = 1000;

		bool * h_R = (bool*)malloc(max * sizeof(bool));
		memset(h_R, 0, max * sizeof(bool));

		CUdeviceptr d_R;
		test(cuMemAlloc(&d_R, max * sizeof(bool)), "cuMemAlloc");
		test(cuMemcpyHtoD(d_R, h_R, max * sizeof(bool)), "cuMemcpyHtoD");

		CUdeviceptr d_N;
		int h_N = 0;
		test(cuMemAlloc(&d_N, sizeof(int)), "cuMemAlloc");

		test(cuMemcpyHtoD(d_N, &h_N, sizeof(int)), "cuMemcpyHtoD");

		// Kernel arguments: (bool* results, int* count)
		int offset = 0;
		void* ptr;

		ptr = (void*)(size_t)d_R;
		ALIGN_UP(offset, __alignof(ptr));
		test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
		offset += sizeof(ptr);

		ptr = (void*)(size_t)d_N;
		ALIGN_UP(offset, __alignof(ptr));
		test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
		offset += sizeof(ptr);

		test(cuParamSetSize(proc, offset), "cuParamSetSize");

		int threadsPerBlock = 1;
		int blocksPerGrid = 1;

		test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape");

		test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid");

		test(cuMemcpyDtoH(h_R, d_R, max * sizeof(bool)), "cuMemcpyDtoH");

		test(cuMemcpyDtoH(&h_N, d_N, sizeof(int)), "cuMemcpyDtoH");

		test(cuMemFree(d_R), "cuMemFree");

		test(cuMemFree(d_N), "cuMemFree");

		bool failed = false;
		// h_N comes from the device; never read past the `max` entries of h_R.
		for (int i = 0; i < h_N && i < max; ++i)
		{
			if (h_R[i] == 0)
			{
				failed = true;
				std::cout << "\nTest " << i << " failed.\n";
				std::cout.flush();
			}
		}
		free(h_R);   // host result buffer was leaked before

		if (! failed)
			std::cout << test_name << " passed.\n";
		else {
			std::cout << test_name << " failed.\n";
		}
	}
	catch (...)
	{
		std::string s = test_name;
		s = s.append(" crashed.\n");
		test(1, s.c_str());
	}
}
Exemple #23
0
// Instruction-level test driver for inst.ptx.
//
// Usage: <program> [device-index]
// Selects the device given as the first argument (default 0), loads inst.ptx,
// runs the "InstBasic" kernel as a sanity check (exits with status 1 when it
// fails) and then runs the "InstLSMC" and "InstAddSub" kernels through
// load_and_test().
int main(int argc, char *argv[])
{
	argc--; argv++;

	// Instruction-level test of PTX assembly language and emulator.
	// This test should work natively and under emulation.  Many of the
	// instructions tested here stress many poorly documented features
	// of the PTX assembly language.  If the emulator passes these
	// tests, then it can surely pass code that is generated by the
	// nvcc compiler.

	test(cuInit(0), "cuInit");

	int deviceCount = 0;
	test(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");

	int device = 0;
	if (argc)
		device = atoi(*argv);

	CUdevice cuDevice = 0;
	test(cuDeviceGet(&cuDevice, device), "cuDeviceGet");

	// The context creation result used to be stored in an unused local and
	// never inspected; check it like every other driver call.
	CUcontext cuContext;
	test(cuCtxCreate(&cuContext, 0, cuDevice), "cuCtxCreate");

	CUmodule cuModule;
	test(cuModuleLoad(&cuModule, "inst.ptx"), "cuModuleLoad");

	// Do basic test.  No sense continuing if we cannot complete this
	// test.
	try
	{
		CUfunction proc;
		test(cuModuleGetFunction(&proc, cuModule, "InstBasic"), "cuModuleGetFunction");

		bool * h_R = (bool*)malloc(sizeof(bool));
		memset(h_R, 0, sizeof(bool));

		CUdeviceptr d_R;
		test(cuMemAlloc(&d_R, sizeof(bool)), "cuMemAlloc");

		test(cuMemcpyHtoD(d_R, h_R, sizeof(bool)), "cuMemcpyHtoD");

		// Single kernel argument: pointer to the result flag
		int offset = 0;
		void* ptr;

		ptr = (void*)(size_t)d_R;
		ALIGN_UP(offset, __alignof(ptr));
		test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
		offset += sizeof(ptr);

		test(cuParamSetSize(proc, offset), "cuParamSetSize");

		int threadsPerBlock = 1;
		int blocksPerGrid = 1;

		test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape");

		test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid");

		test(cuMemcpyDtoH(h_R, d_R, sizeof(bool)), "cuMemcpyDtoH");

		test(cuMemFree(d_R), "cuMemFree");

		// Capture the verdict, then release the host buffer (was leaked).
		const bool passed = (h_R[0] == 1);
		free(h_R);

		if (passed)
			std::cout << "Basic test passed.\n";
		else {
			std::cout << "Basic test failed.\n";
			exit(1);
		}

	} catch (...)
	{
		test(1, "test crashed.");
	}

	// Do LD, ST, MOV test.
	load_and_test(cuModule, "InstLSMC");

	// Do ADD, SUB test.
	load_and_test(cuModule, "InstAddSub");

	return 0;
}
/*
 * Launches the `checksum` kernel of the context's GPU module over `buffers`.
 *
 * buffers:  host memory; the first (c->n)*buf_size bytes are uploaded and, in
 *           the non-mmap build, (c->m)*buf_size bytes of results are written
 *           back directly after them.
 * buf_size: size of one buffer in bytes.
 * c:        context; c->acc_context is used as a gpu_context.
 *
 * Returns GIB_SUC on the direct path, or the return value of
 * gib_generate_nc() when the work is delegated (non-mmap build only).
 * Driver calls are wrapped in ERROR_CHECK_FAIL, whose failure behaviour is
 * defined elsewhere in the project.
 */
int gib_generate ( void *buffers, int buf_size, gib_context c ) {
  /* Make this context's CUDA context current for the duration of the call */
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  /* Do it all at once if the buffers are small enough */
#if !GIB_USE_MMAP
  /* This is too large to do at once in the GPU memory we have allocated.
   * Split it into several noncontiguous jobs. 
   */
  if (buf_size > gib_buf_size) {
    int rc = gib_generate_nc(buffers, buf_size, buf_size, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int nthreads_per_block = 128;
  /* Bytes covered by one block: sizeof(int) bytes per thread */
  int fetch_size = sizeof(int)*nthreads_per_block;
  /* Ceiling division: blocks needed to cover buf_size bytes */
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;
  
  /* Build the F table on the host and upload (c->m)*(c->n) bytes of it to
   * the module global "F_d" */
  unsigned char F[256*256];
  gib_galois_gen_F(F, c->m, c->n);
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, (c->m)*(c->n)));
  
#if !GIB_USE_MMAP
  /* Copy the buffers to memory */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, 
				(c->n)*buf_size));
#endif
  /* Configure and launch */
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->checksum, nthreads_per_block,
				       1, 1));
  /* Kernel arguments: (buffer pointer, buf_size) */
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  /* mmap build: pass the device alias of the host buffers */
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)(gpu_c->buffers);
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &buf_size,
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->checksum, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->checksum, nblocks, 1));

  /* Get the results back */
#if !GIB_USE_MMAP
  /* Results live directly after the c->n input buffers, on device and host */
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, (c->m)*buf_size));
#else
  /* mmap build: no copy back, just wait for the context to finish */
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC; 
}
// Launches the NV12 -> ARGB conversion kernel `fpFunc` on stream `streamID`.
//
// d_srcNV12 / nSourcePitch : source surface (device pointer) and its pitch
// d_dstARGB / nDestPitch   : destination surface (device pointer) and its pitch
// width, height            : image dimensions in pixels
//
// Launch configuration: 32x16 thread blocks; each thread outputs two pixels
// horizontally, so the grid width covers width / 2 columns (rounded up).
//
// Returns the CUresult of the launch call. A failure is additionally logged to
// stderr. With CUDA < 4.0 the parameter-setup calls go through
// cutilDrvSafeCall instead.
CUresult  cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                  CUdeviceptr d_dstARGB, size_t nDestPitch,
                                  uint32 width,          uint32 height,
                                  CUfunction fpFunc, CUstream streamID)
{
    CUresult status;
    // Each thread will output 2 pixels at a time.  The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if CUDA_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height
                   };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    cutilDrvSafeCall(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers

    // device pointer for Source Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_srcNV12,    sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_dstARGB,    sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    //  set the Destination Pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nDestPitch,   sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    cutilDrvSafeCall(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        // CUfunction is a pointer-sized handle: print it with %p instead of
        // truncating it through a cast to unsigned int.
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", (void *)fpFunc, (int)status);
    }

    return status;
}
void CUDARunner::FindBestConfiguration()
{
	unsigned long lowb=16;
	unsigned long highb=128;
	unsigned long lowt=16;
	unsigned long hight=256;
	unsigned long bestb=16;
	unsigned long bestt=16;
	int offset=0;
	void *ptr=0;
	int64 besttime=std::numeric_limits<int64>::max();

	if(m_requestedgrid>0 && m_requestedgrid<=65536)
	{
		lowb=m_requestedgrid;
		highb=m_requestedgrid;
	}

	if(m_requestedthreads>0 && m_requestedthreads<=65536)
	{
		lowt=m_requestedthreads;
		hight=m_requestedthreads;
	}

	for(int numb=lowb; numb<=highb; numb*=2)
	{
		for(int numt=lowt; numt<=hight; numt*=2)
		{
			if(AllocateResources(numb,numt)==true)
			{
				// clear out any existing error
				CUresult err=CUDA_SUCCESS;

				int64 st=GetTimeMillis();

				for(int it=0; it<128*256*2 && err==CUDA_SUCCESS; it+=(numb*numt))
				{

					cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));

					offset=0;
					int loops=64;
					int bits=5;

					ptr=(void *)(size_t)m_devin;
					ALIGN_UP(offset, __alignof(ptr));
					cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
					offset+=sizeof(ptr);

					ptr=(void *)(size_t)m_devout;
					ALIGN_UP(offset, __alignof(ptr));
					cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
					offset+=sizeof(ptr);

					ALIGN_UP(offset, __alignof(loops));
					cuParamSeti(m_function,offset,loops);
					offset+=sizeof(loops);

					ALIGN_UP(offset, __alignof(bits));
					cuParamSeti(m_function,offset,bits);
					offset+=sizeof(bits);

					cuParamSetSize(m_function,offset);

					err=cuFuncSetBlockShape(m_function,numt,1,1);
					if(err!=CUDA_SUCCESS)
					{
						printf("cuFuncSetBlockShape error %d\n",err);
						continue;
					}

					err=cuLaunchGrid(m_function,numb,1);
					if(err!=CUDA_SUCCESS)
					{
						printf("cuLaunchGrid error %d\n",err);
						continue;
					}

					cuMemcpyDtoH(m_out,m_devout,numt*numb*sizeof(cuda_out));

					if(err!=CUDA_SUCCESS)
					{
						printf("CUDA error %d\n",err);
					}
				}

				int64 et=GetTimeMillis();

				printf("Finding best configuration step end (%d,%d) %"PRI64d"ms  prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime);

				if((et-st)<besttime && err==CUDA_SUCCESS)
				{
					bestb=numb;
					bestt=numt;
					besttime=et-st;
				}
			}
		}
	}

	m_numb=bestb;
	m_numt=bestt;

	AllocateResources(m_numb,m_numt);

}
/*
 * Recovers lost data buffers on the GPU.
 *
 * buffers      - host memory holding c->n buffers of buf_size bytes each,
 *                followed by room for recover_last recovered buffers (the
 *                result is written starting at byte offset c->n * buf_size).
 * buf_size     - size in bytes of one buffer.
 * buf_ids      - buf_ids[0..n-1] select the rows of the coding matrix that
 *                correspond to the available buffers; buf_ids[n..n+recover_last-1]
 *                name the buffers to rebuild and must all be < n (parity
 *                buffers cannot be recovered here).
 * recover_last - number of buffers to rebuild.
 * c            - context; c->acc_context holds the CUDA context and module.
 *
 * Returns GIB_SUC on success, GIB_ERR if a parity buffer was requested, or
 * (non-MMAP build only) the result of gib_cpu_recover() when buf_size exceeds
 * gib_buf_size. The CUDA context pushed on entry is popped on every return
 * path.
 *
 * NOTE(review): A, inv and modA are fixed 128*128 byte arrays; this assumes
 * (n+m)*n and (n+recover_last)*n never exceed that - confirm where n and m
 * are validated.
 */
int gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
		  gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if !GIB_USE_MMAP
  /* Buffers larger than the device staging area are handled by the CPU path. */
  if (buf_size > gib_buf_size) {
    int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int i, j;
  int n = c->n;
  int m = c->m;
  unsigned char A[128*128], inv[128*128], modA[128*128];
  for (i = n; i < n+recover_last; i++)
    if (buf_ids[i] >= n) {
      fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
      /* Balance the push done on entry before bailing out; without this the
         caller is left with this context still current. */
      ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
      return GIB_ERR;
    }

  gib_galois_gen_A(A, m+n, n);

  /* Modify the matrix to have the failed drives reflected */
  for (i = 0; i < n; i++) 
    for (j = 0; j < n; j++) 
      modA[i*n+j] = A[buf_ids[i]*n+j];

  gib_galois_gaussian_elim(modA, inv, n, n);

  /* Copy row buf_ids[i] into row i */
  for (i = n; i < n+recover_last; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = inv[buf_ids[i]*n+j];

  /* One block per 128 ints' worth of bytes of a buffer, rounded up. */
  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  /* Upload the rows past the first n*n bytes of modA to the module global F_d. */
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block, 
				       1, 1));
  /* Kernel arguments: buffer pointer, buf_size, recover_last. */
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)gpu_c->buffers;
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size, 
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last, 
			       sizeof(recover_last)));
  offset += sizeof(recover_last);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));
#if !GIB_USE_MMAP
  /* Copy the recovered buffers back to the host, right after the n inputs. */
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
  /* No copy-back in the mapped-memory build, so wait for the kernel here and
     check the result like every other driver call in this function. */
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
Exemple #28
0
int main(void)
{
    // Initialize
    if (cuInit(0) != CUDA_SUCCESS)
	exit (0);
    // Get number of devices supporting CUDA
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
	printf("There is no device supporting CUDA.\n");
	exit (0);
    }
    // Get handle for device 0
    CUdevice cuDevice = 0;
    cuDeviceGet(&cuDevice, 0);
    // Create context
    CUcontext cuContext;
    cuCtxCreate(&cuContext, 0, cuDevice);
    // Create module from binary file
    CUmodule cuModule;
    cuModuleLoad(&cuModule, “VecAdd.ptx”);
    // Get function handle from module
    CUfunction vecAdd;
    cuModuleGetFunction(&vecAdd, cuModule, "VecAdd");
    // Allocate vectors in device memory
    size_t size = N * sizeof(float);
    CUdeviceptr d_A;


    cuMemAlloc(&d_A, size);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, size);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, size);
    // Copy vectors from host memory to device memory
    // h_A and h_B are input vectors stored in host memory
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);
    // Invoke kernel
#define ALIGN_UP(offset, alignment)					\
    (offset) = ((offset) + (alignment) – 1) & ~((alignment) – 1)
    int offset = 0;
    ALIGN_UP(offset, __alignof(d_A));
    cuParamSetv(vecAdd, offset, &d_A, sizeof(d_A));
    offset += sizeof(d_A);
    ALIGN_UP(offset, __alignof(d_B));
    cuParamSetv(vecAdd, offset, &d_B, sizeof(d_B));
    offset += sizeof(d_B);
    ALIGN_UP(offset, __alignof(d_C));
    cuParamSetv(vecAdd, offset, &d_C, sizeof(d_C));
    offset += sizeof(d_C);
    cuParamSetSize(VecAdd, offset);
    int threadsPerBlock = 256;
    int blocksPerGrid =
	(N + threadsPerBlock – 1) / threadsPerBlock;
    cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1);
    cuLaunchGrid(VecAdd, blocksPerGrid, 1);
    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    cuMemcpyDtoH(h_C, d_C, size);
    // Free device memory
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);

    return (0);
}
Exemple #29
0
/*************************************************
 * HOST DRIVERS
 *************************************************/
// Launches drvfun over N elements in one or more passes.
//
// Each pass covers at most MAXTHREADS_STREAM*BLOCK_DIM1D elements and uses
// one-dimensional blocks of BLOCK_DIM1D threads. The kernel receives, in this
// order: the element count of the pass, the index of the first element of the
// pass, and then the nrhs caller-supplied arguments described by prhs (each
// with its own pointer, size and alignment). Any driver API failure is
// reported through mexErrMsgTxt.
void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) {

  unsigned int passLimit = MAXTHREADS_STREAM;
  int passCount = iDivUp(N, passLimit*BLOCK_DIM1D);

  for (int pass = 0; pass < passCount; pass++) {
    // First element handled by this pass.
    int firstElem = pass * passLimit * BLOCK_DIM1D;

    // Every pass is full-sized except the last, which takes the remainder.
    int elemCount = (pass == (passCount - 1))
                        ? N - pass * passLimit * BLOCK_DIM1D
                        : passLimit * BLOCK_DIM1D;

    int blocksX = iDivUp(elemCount, BLOCK_DIM1D); // number of x blocks

    // Execution configuration: 1D blocks, no dynamic shared memory.
    if (cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuFuncSetBlockShape");
    }

    if (cuFuncSetSharedSize(drvfun, 0) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuFuncSetSharedSize");
    }

    // Build the kernel parameter block; paramBytes is the running offset.
    int paramBytes = 0;

    // Argument 1: number of elements in this pass.
    ALIGN_UP(paramBytes, __alignof(elemCount));
    if (cuParamSeti(drvfun, paramBytes, elemCount) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    paramBytes += sizeof(elemCount);

    // Argument 2: starting element offset of this pass.
    ALIGN_UP(paramBytes, __alignof(firstElem));
    if (cuParamSeti(drvfun, paramBytes, firstElem) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    paramBytes += sizeof(firstElem);

    // Remaining arguments: supplied by the caller, copied verbatim.
    for (int k = 0; k < nrhs; k++) {
      hostdrv_pars_t *arg = &prhs[k];
      ALIGN_UP(paramBytes, arg->align);
      if (cuParamSetv(drvfun, paramBytes, arg->par, arg->psize)
          != CUDA_SUCCESS) {
        mexErrMsgTxt("Error in cuParamSetv");
      }
      paramBytes += arg->psize;
    }

    if (cuParamSetSize(drvfun, paramBytes) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSetSize");
    }

    // Launch on stream 0.
    if (cuLaunchGridAsync(drvfun, blocksX, 1, 0) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error running kernel");
    }
  }

}
Exemple #30
0
// Host code
int main()
{
	int N = 3;
	size_t size = N * sizeof(float);
	float* h_A = (float*)malloc(size);
	float* h_B = (float*)malloc(size);
	float* h_C = (float*)malloc(size);

	// Set up vectors.
	for (int i = 0; i < N; ++i)
	{
		h_A[i] = i * 1.0;
		h_B[i] = i * 1.0 + 1;
		h_C[i] = 0;
		printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]);
	}

	// Initialize
	if (cuInit(0) != CUDA_SUCCESS)
		exit (0);

	// Get number of devices supporting CUDA
	int deviceCount = 0;
	cuDeviceGetCount(&deviceCount);
	if (deviceCount == 0)
	{
		printf("There is no device supporting CUDA.\n");
		exit (0);
	}

	// Get handle for device 0
	CUdevice cuDevice = 0;
	CUresult r1 = cuDeviceGet(&cuDevice, 0);
	// Create context
	CUcontext cuContext;
	cuCtxCreate(&cuContext, 0, cuDevice);
	// Create module from binary file
	CUmodule cuModule;
	CUresult r2 = cuModuleLoad(&cuModule, "VecAdd.ptx");
	// Get function handle from module
	CUfunction vecAdd;
	CUresult r3 = cuModuleGetFunction(&vecAdd, cuModule, "VecAdd");
	// Allocate vectors in device memory
	CUdeviceptr d_A;
	CUresult r4 = cuMemAlloc(&d_A, size);
	CUdeviceptr d_B;
	CUresult r5 = cuMemAlloc(&d_B, size);
	CUdeviceptr d_C;
	CUresult r6 = cuMemAlloc(&d_C, size);
	// Copy vectors from host memory to device memory
	// h_A and h_B are input vectors stored in host memory
	CUresult r7 = cuMemcpyHtoD(d_A, h_A, size);
	CUresult r8 = cuMemcpyHtoD(d_B, h_B, size);
	// Invoke kernel
#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
	int offset = 0;
	void* ptr;
	ptr = (void*)(size_t)d_A;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r9 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	ptr = (void*)(size_t)d_B;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r10 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	ptr = (void*)(size_t)d_C;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r11 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	ptr = (void*)(int)N;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r11a = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	CUresult r12 = cuParamSetSize(vecAdd, offset);
	int threadsPerBlock = 256;
	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
	CUresult r13 = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1);
	CUresult r14 = cuLaunchGrid(vecAdd, blocksPerGrid, 1);
	// Copy result from device memory to host memory
	// h_C contains the result in host memory
	CUresult r15 = cuMemcpyDtoH(h_C, d_C, size);
	for (int i = 0; i < N; ++i)
	{
		printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]);
	}

	// Free device memory
	cuMemFree(d_A);
	cuMemFree(d_B);
	cuMemFree(d_C);
}