Example #1
//host driver
void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int imgSize, int numRegionsY, int shmemX, int shmem, int nrhs, hostdrv_pars_t *prhs) {

  //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z);

  CUresult err = CUDA_SUCCESS;

  // setup execution parameters
  if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }

  if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, shmem)) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }


  //mexPrintf("block shape ok\n");

  // add parameters
  int poffset = 0;

  // CUDA kernels interface
  // N: number of elements
  for (int p=0;p<nrhs;p++) {
    if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    poffset += prhs[p].psize;
  }

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imgSize)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(imgSize);

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, numRegionsY)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(numRegionsY);

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(shmemX);

  if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  err = cuLaunchGridAsync(drvfun, grid.x, grid.y, 0);
  if (CUDA_SUCCESS != err) {
    mexErrMsgTxt("Error running kernel");
  }
  
}
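
Note: the hostdrv_pars_t type used by these MEX host drivers is not defined in the excerpt. Judging from the fields the drivers access (par, psize, and, in the later examples, align), a plausible sketch of the struct is the following; the exact field types are an assumption.

typedef struct hostdrv_pars {
  void *par;           /* host pointer to the kernel argument's value */
  unsigned int psize;  /* size of the argument in bytes */
  unsigned int align;  /* alignment requirement, used with ALIGN_UP in later examples */
} hostdrv_pars_t;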
Example #2
File: ov.c Project: CPFL/gtraffic
/*
 * Initialization required to use the kernel program
 */
void
init_cuda(void){

  thread_num = (N <= 16) ? N : 16;
  block_num = N / (thread_num*thread_num);
  if(N % (thread_num*thread_num) != 0) block_num++;
  
  res = cuInit(0);
  if(res != CUDA_SUCCESS){
    printf("cuInit failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuDeviceGet(&dev, 0);
  if(res != CUDA_SUCCESS){
    printf("cuDeviceGet failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuCtxCreate(&ctx, 0, dev);
  if(res != CUDA_SUCCESS){
    printf("cuCtxCreate failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleLoad(&module, "./cuda_main.cubin");
  if(res != CUDA_SUCCESS){
    printf("cuModuleLoad() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuModuleGetFunction(&function, module, "cuda_main");
  if(res != CUDA_SUCCESS){
    printf("cuModuleGetFunction() failed: res = %s\n",  conv(res));
    exit(1);
  }
  

  /* 
   * preparation for launching the kernel
   */
  res = cuFuncSetSharedSize(function, 0x40);  /* arbitrary value */
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  if(res != CUDA_SUCCESS){
    printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res));
    exit(1);
  }

}
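
init_cuda() relies on file-scope globals declared elsewhere in ov.c. A minimal sketch of what it assumes, with the names taken from the function body and conv() presumed to map a CUresult to a printable string:

static int thread_num, block_num;
static CUresult res;
static CUdevice dev;
static CUcontext ctx;
static CUmodule module;
static CUfunction function;
/* N is the problem size and conv() the CUresult-to-string helper,
   both defined elsewhere in the project. */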
Example #3
SEXP R_auto_cuFuncSetSharedSize(SEXP r_hfunc, SEXP r_bytes)
{
    SEXP r_ans = R_NilValue;
    CUfunction hfunc = (CUfunction) getRReference(r_hfunc);
    unsigned int bytes = REAL(r_bytes)[0];
    
    CUresult ans;
    ans = cuFuncSetSharedSize(hfunc, bytes);
    
    r_ans = Renum_convert_CUresult(ans) ;
    
    return(r_ans);
}
Example #4
/* Driver */
void hostGPUPdist(CUfunction drvfun, int nrhs, hostdrv_pars_t *prhs, int n, int m) {

  /* Each thread block computes a linear block of the target */
  int gridx = (n + BLOCK_DIM1D - 1) / BLOCK_DIM1D; //BLOCK_DIM1D set in GPUkernel.hh
  
  CUresult err = CUDA_SUCCESS;

  // setup execution parameters
  if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }

  if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, m*sizeof(float))) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }

  // add parameters
  int poffset = 0;

  // CUDA kernels interface
  // N: number of elements
  // offset: used for streams

  for (int p=0;p<nrhs;p++) {
    if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    poffset += prhs[p].psize;
  }

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, n)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(n);

  if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, m)) {
    mexErrMsgTxt("Error in cuParamSeti");
  }
  poffset += sizeof(m);


  if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
  if (CUDA_SUCCESS != err) {
    mexErrMsgTxt("Error running kernel");
  }
}
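
BLOCK_DIM1D and iDivUp() come from GPUkernel.hh, which is not shown here. iDivUp is the usual round-up integer division helper from the CUDA SDK samples; a typical definition, with BLOCK_DIM1D assumed to be a value such as 256:

#define BLOCK_DIM1D 256  /* placeholder; the real value is set in GPUkernel.hh */

/* Number of size-b blocks needed to cover a elements, rounding up. */
static int iDivUp(int a, int b) {
  return (a + b - 1) / b;
}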
Example #5
/**
 * Invokes the kernel @f on a @gridDimX x @gridDimY x @gridDimZ grid of blocks. 
 * Each block contains @blockDimX x @blockDimY x @blockDimZ threads.
 * @sharedMemBytes sets the amount of dynamic shared memory that will be 
 * available to each thread block.
 *
 * cuLaunchKernel() can optionally be associated with a stream by passing a 
 * non-zero hStream argument.
 *
 * Kernel parameters to @f can be specified in one of two ways:
 *
 * 1) Kernel parameters can be specified via kernelParams. If f has N 
 * parameters, then kernelParams needs to be an array of N pointers. Each of 
 * kernelParams[0] through kernelParams[N-1] must point to a region of memory 
 * from which the actual kernel parameter will be copied. The number of kernel 
 * parameters and their offsets and sizes do not need to be specified as that 
 * information is retrieved directly from the kernel's image.
 *
 * 2) Kernel parameters can also be packaged by the application into a single 
 * buffer that is passed in via the extra parameter. This places the burden on
 * the application of knowing each kernel parameter's size and alignment/
 * padding within the buffer. Here is an example of using the extra parameter 
 * in this manner:
 *
 *  size_t argBufferSize;
 *  char argBuffer[256];
 *
 *  // populate argBuffer and argBufferSize
 *
 *  void *config[] = {
 *      CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
 *      CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
 *      CU_LAUNCH_PARAM_END
 *  };
 *  status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 *
 * The extra parameter exists to allow cuLaunchKernel to take additional, less 
 * commonly used arguments. extra specifies a list of names of extra settings 
 * and their corresponding values. Each extra setting name is immediately 
 * followed by the corresponding value. The list must be terminated with 
 * either NULL or CU_LAUNCH_PARAM_END.
 *
 *  CU_LAUNCH_PARAM_END, which indicates the end of the extra array;
 *  CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next value in 
 *  extra will be a pointer to a buffer containing all the kernel parameters 
 *  for launching kernel f;
 *  CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next value in extra
 *  will be a pointer to a size_t containing the size of the buffer specified 
 *  with CU_LAUNCH_PARAM_BUFFER_POINTER;
 *
 * The error CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters 
 * are specified with both kernelParams and extra (i.e. both kernelParams and 
 * extra are non-NULL).
 *
 * Calling cuLaunchKernel() sets persistent function state that is the same as 
 * function state set through the following deprecated APIs:
 *
 * cuFuncSetBlockShape() cuFuncSetSharedSize() cuParamSetSize() cuParamSeti() 
 * cuParamSetf() cuParamSetv()
 *
 * When the kernel @f is launched via cuLaunchKernel(), the previous block 
 * shape, shared size, and parameter info associated with @f are overwritten.
 *
 * Note that to use cuLaunchKernel(), the kernel @f must either have been 
 * compiled with toolchain version 3.2 or later so that it will contain kernel 
 * parameter information, or have no kernel parameters. If either of these 
 * conditions is not met, then cuLaunchKernel() will return 
 * CUDA_ERROR_INVALID_IMAGE.
 *
 * Parameters:
 * f - Kernel to launch
 * gridDimX	- Width of grid in blocks
 * gridDimY - Height of grid in blocks
 * gridDimZ - Depth of grid in blocks
 * blockDimX - X dimension of each thread block
 * blockDimY - Y dimension of each thread block
 * blockDimZ - Z dimension of each thread block
 * sharedMemBytes - Dynamic shared-memory size per thread block in bytes
 * hStream - Stream identifier
 * kernelParams - Array of pointers to kernel parameters
 * extra - Extra options
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_HANDLE, 
 * CUDA_ERROR_INVALID_IMAGE, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_LAUNCH_FAILED,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, CUDA_ERROR_LAUNCH_TIMEOUT, 
 * CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, 
 * CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 
 */
CUresult cuLaunchKernel
(CUfunction f, 
 unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
 unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
 unsigned int sharedMemBytes, CUstream hStream, 
 void **kernelParams, void **extra)
{
	struct gdev_cuda_raw_func *rf;
	CUresult res;
	int i;

	if (hStream) {
		GDEV_PRINT("cuLaunchKernel: Stream is not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	if (extra) {
		GDEV_PRINT("cuLaunchKernel: Extra Parameters are not supported.\n");
		return CUDA_ERROR_INVALID_HANDLE;
	}

	res = cuFuncSetSharedSize(f, sharedMemBytes);
	if (res != CUDA_SUCCESS)
		return res;

	res = cuFuncSetBlockShape(f, blockDimX, blockDimY, blockDimZ);
	if (res != CUDA_SUCCESS)
		return res;

	rf = &f->raw_func;
	for (i = 0; i < rf->param_count; i++) {
		void *p = kernelParams[i];
		int offset = rf->param_info[i].offset;
		uint32_t size = rf->param_info[i].size;
		res = cuParamSetv(f, offset, p, size);
		if (res != CUDA_SUCCESS)
			return res;
	}

	res = cuParamSetSize(f, rf->param_size);
	if (res != CUDA_SUCCESS)
		return res;

	res = cuLaunchGrid(f, gridDimX, gridDimY);
	if (res != CUDA_SUCCESS)
		return res;

	return CUDA_SUCCESS;
}
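
The documentation above only gives a code sample for the extra-buffer style (method 2). For comparison, a minimal sketch of the more common kernelParams style (method 1), assuming a kernel f that takes (float *out, int n) and a device buffer d_out already allocated with cuMemAlloc():

CUdeviceptr d_out;                /* assumed allocated elsewhere */
int n = 1024;
void *params[] = { &d_out, &n };  /* one pointer per kernel parameter */

/* 256 threads per block, enough blocks to cover n elements,
   no dynamic shared memory, default stream. */
CUresult status = cuLaunchKernel(f,
                                 (n + 255) / 256, 1, 1,  /* grid dims */
                                 256, 1, 1,              /* block dims */
                                 0, NULL,                /* shmem, stream */
                                 params, NULL);          /* kernelParams, extra */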
Example #6
/*************************************************
 * HOST DRIVERS
 *************************************************/
void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) {


  unsigned int maxthreads = MAXTHREADS_STREAM;
  int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D);
  CUresult err = CUDA_SUCCESS;
  for (int str = 0; str < nstreams; str++) {
    int offset = str * maxthreads * BLOCK_DIM1D;
    int size = 0;
    if (str == (nstreams - 1))
      size = N - str * maxthreads * BLOCK_DIM1D;
    else
      size = maxthreads * BLOCK_DIM1D;


    int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks

    // setup execution parameters

    if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) {
      mexErrMsgTxt("Error in cuFuncSetBlockShape");
    }

    if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) {
      mexErrMsgTxt("Error in cuFuncSetSharedSize");
    }


    // add parameters
    int poffset = 0;

    // CUDA kernels interface
    // N: number of elements
    // offset: used for streams
    ALIGN_UP(poffset, __alignof(size));
    if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    poffset += sizeof(size);

    ALIGN_UP(poffset, __alignof(offset));
    if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    poffset += sizeof(offset);

    for (int p=0;p<nrhs;p++) {
      ALIGN_UP(poffset, prhs[p].align);
      if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
        mexErrMsgTxt("Error in cuParamSetv");
      }
      poffset += prhs[p].psize;
    }

    if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
      mexErrMsgTxt("Error in cuParamSetSize");
    }

    err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
    if (CUDA_SUCCESS != err) {
      mexErrMsgTxt("Error running kernel");
    }
  }

}
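
ALIGN_UP is not defined in this excerpt; it is the standard alignment macro from the CUDA driver API samples, which rounds offset up to the next multiple of alignment (a power of two):

#define ALIGN_UP(offset, alignment) \
  (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)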
Example #7
//host driver
//void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) {
void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){
    //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z);
    
    
    unsigned int maxthreads = 65000;
    // Set threads per block here.
    unsigned int blocksdim1d = 256;
    dim3 threads(blocksdim1d, 1, 1);
    int nstreams = iDivUp(N, maxthreads*blocksdim1d);
    CUresult err = CUDA_SUCCESS;
    for (int str = 0; str < nstreams; str++) {
        int offset = str * maxthreads * blocksdim1d;
        int size = 0;
        if (str == (nstreams - 1))
            size = N - str * maxthreads * blocksdim1d;
        else
            size = maxthreads * blocksdim1d;
        
        
        int gridx = iDivUp(size, blocksdim1d); // number of x blocks
        
        // setup execution parameters
        
        if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) {
            mexErrMsgTxt("Error in cuFuncSetBlockShape");
        }
        
        if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) {
            mexErrMsgTxt("Error in cuFuncSetSharedSize");
        }
        
        //mexPrintf("block shape ok\n");
        
        // add parameters
        int poffset = 0;
        
        // CUDA kernels interface
        // N: number of elements
        for (int p=0;p<nrhs;p++) {
            ALIGN_UP(poffset, prhs[p].align);
            if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
                mexErrMsgTxt("Error in cuParamSetv");
            }
            poffset += prhs[p].psize;
        }
        
        ALIGN_UP(poffset, __alignof(size));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(size);
        
        ALIGN_UP(poffset, __alignof(offset));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(offset);
        
        ALIGN_UP(poffset, __alignof(imx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(imx);
        
        ALIGN_UP(poffset, __alignof(imy));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imy)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(imy);
        
        ALIGN_UP(poffset, __alignof(outx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(outx);
        
        ALIGN_UP(poffset, __alignof(outy));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(outy);
        
        ALIGN_UP(poffset, __alignof(poolx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(poolx);
        
        ALIGN_UP(poffset, __alignof(pooly));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(pooly);
        
//   if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) {
//     mexErrMsgTxt("Error in cuParamSeti");
//   }
//   poffset += sizeof(shmemX);
        
        if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
            mexErrMsgTxt("Error in cuParamSetSize");
        }
        
        err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
        if (CUDA_SUCCESS != err) {
            mexErrMsgTxt("Error running kernel");
        }
        
    }
}
Example #8
int cuda_test_fmadd(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	float *a = (float *) malloc (n*n * sizeof(float));
	float *b = (float *) malloc (n*n * sizeof(float));
	float *c = (float *) malloc (n*n * sizeof(float));
	int block_x, block_y, grid_x, grid_y;
	int offset;
	char fname[256];
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	float exec;

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		for(j = 0; j < n; j++) {
			idx = i * n + j;
			a[idx] = i + 0.1;
			b[idx] = i + 0.1;
		}
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;
	printf("block = (%d, %d)\n", block_x, block_y);
	printf("grid = (%d, %d)\n", grid_x, grid_y);

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/fmadd_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetSharedSize(function, 0x40); /* arbitrary value */
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetSharedSize() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}

	gettimeofday(&tv_h2d_start, NULL);
	/* upload a[] and b[] */
	res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	/* set kernel parameters */
	offset = 0;
	res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev));	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(a_dev);
	res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(b_dev);
	res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(c_dev);
	res = cuParamSetv(function, offset, &n, sizeof(n));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (n) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(n);
	res = cuParamSetSize(function, offset);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */
	res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	/* check the results */
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			idx = i * n + j;
			if (c[idx] != a[idx] + b[idx]) {
				printf("c[%d] = %f\n", idx, c[idx]);
				printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]);
				return -1;
			}
		}
	}

	free(a);
	free(b);
	free(c);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);
	printf("Exec: %f\n", exec);
	printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec);
	printf("Total: %f\n", total);

	return 0;
}
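
tvsub() stores the difference between two timevals (end minus start) and is defined elsewhere in this test suite; a sketch of an implementation consistent with how it is called above:

static void tvsub(struct timeval *x, struct timeval *y, struct timeval *ret)
{
	ret->tv_sec = x->tv_sec - y->tv_sec;
	ret->tv_usec = x->tv_usec - y->tv_usec;
	if (ret->tv_usec < 0) {
		ret->tv_sec--;
		ret->tv_usec += 1000000;
	}
}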
Example #9
void swanRunKernelAsync( const char *kernel,  block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types  ) {
	// find the kernel

	if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; } // suppress launch of kernel if any of the launch dims are 0

	CUfunction f = NULL;
	int i;
	int offset = 0;
	CUresult err;

	int type;
	int idx=0;
	try_init();
	for( i=0; i < state.num_funcs; i++ ) {
		if( !strcmp( state.func_names[i], kernel ) ) {
			f = state.funcs[i];
			break;
		}
	}

	if( f == NULL ) {
		for( i=0; i < state.num_mods; i++ ) {
			cuModuleGetFunction( &f, state.mods[i], kernel );
			if( f != NULL ) {
				// found a kernel. store it for future use
				int j = state.num_funcs;
				state.num_funcs++;
				state.funcs      = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * state.num_funcs );
				state.funcs[j]   = f;
				state.func_names = (char**)      realloc( state.func_names, sizeof(char*) * state.num_funcs );
				state.func_names[j] = (char*) malloc( strlen(kernel) + 1 );
				strcpy( state.func_names[j], kernel );
				break; 
			}
		}
	}

	if( f == NULL ) {
		fprintf(stderr, "Error running kernel [%s] : \n", kernel );
		error( "No kernel found" );
	}

	if( grid.z != 1 ) {
		printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z );
		error( "grid.z needs to be 1" );
	}

//printf("Running kernel [%s]\n", kernel );

	type = types[idx];
	while( type != SWAN_END ) {
		void *ptr = ptrs[idx];
		switch( type ) {
//			DEBLOCK( SWAN_uchar, uchar,  1 );
			DEBLOCK( SWAN_uchar2, uchar2,  2 );
			DEBLOCK( SWAN_uchar3, uchar3,  1 );
			DEBLOCK( SWAN_uchar4, uchar4,  4 );
			DEBLOCK( SWAN_char , int,  1 );
//			DEBLOCK( SWAN_char1 , char1,  1 );
			DEBLOCK( SWAN_char2 , char2,  2 );
			DEBLOCK( SWAN_char3 , char3,  1 );
			DEBLOCK( SWAN_char4 , char4,  4 );
			DEBLOCK( SWAN_int, int,  4 );
//			DEBLOCK( SWAN_int1, int1,  4 );
			DEBLOCK( SWAN_int2, int2,  8 );
			DEBLOCK( SWAN_int3, int3,  4 );
			DEBLOCK( SWAN_int4, int4,  16 );
//			DEBLOCK( SWAN_float, double,  4 );
//			DEBLOCK( SWAN_float1, float1,  4 );
			DEBLOCK( SWAN_float2, float2,  8 );
			DEBLOCK( SWAN_float3, float3,  4 );
			DEBLOCK( SWAN_float4, float4,  16 );

			DEBLOCK( SWAN_uint, uint,  4 );
			DEBLOCK( SWAN_uint2, uint2,  8 );
			DEBLOCK( SWAN_uint3, uint3,  4 );
			DEBLOCK( SWAN_uint4, uint4,  16 );
			DEBLOCK( SWAN_float, float,  4 );


//#define DEBLOCK(swan_type,type,OFFSET) 
#if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4
			case SWAN_PTR: 
				{
//printf("PTR as NATIVE\n");
				ALIGN_UP( offset, (sizeof(void*)));
				cuParamSetv( f, offset, ptr, sizeof(void*) );
				offset += sizeof(void*); }
			break;
#else
			case SWAN_PTR: 
				{
//printf("PTR as INT\n");
				ALIGN_UP( offset, (sizeof(int)));
				cuParamSetv( f, offset, ptr, sizeof(int) );
				offset += sizeof(int); }
			break;
#endif



			default:
				printf("%d\n", type);
				error("Parameter type not handled\n");


		}
		idx++;
		type = types[idx];
	}

//printf("Launching kernel [%s] [%X]  with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z );
//printf(" TOTAL OFFSET %d\n", offset );
	CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) );
#if (CUDA_MAJOR ==3 && CUDA_MINOR >=1 ) || CUDA_MAJOR>=4
	cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd
#endif

	err = cuLaunchGridAsync( f, grid.x, grid.y, NULL ) ; //state.stream ) ;

	if( err != CUDA_SUCCESS ) {
		fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z );
		assert(0);
		exit(-99);
	}

//printf("Kernel completed\n" );
}
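
The DEBLOCK macro is only hinted at by the commented-out #define above. Judging from its three arguments (a SWAN type tag, a C type, and an alignment), a plausible reconstruction is a switch case that aligns the running offset and copies one argument:

#define DEBLOCK(swan_type, type, alignment)            \
	case swan_type:                                    \
		ALIGN_UP(offset, (alignment));                 \
		cuParamSetv(f, offset, ptr, sizeof(type));     \
		offset += sizeof(type);                        \
		break;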
Example #10
	void Function::setSharedSize(unsigned int bytes) const
	{
		detail::error_check(cuFuncSetSharedSize(impl->func, bytes),
			"Can't set Cuda function shared memory size");
	}