Example 1
CUresult CreateCuFunction(const char* name, CuModule* module, int3 blockShape, 
	FunctionPtr* ppFunction) {
	CUfunction func;
	CUresult result = cuModuleGetFunction(&func, module->Handle(), name);
	if(CUDA_SUCCESS != result) return result;

	FunctionPtr f(new CuFunction);

	CuFuncAttr& attr = f->_attributes;
	cuFuncGetAttribute(&attr.maxThreadsPerBlock,
		CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func);
	cuFuncGetAttribute(&attr.sharedSizeBytes, 
		CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func);
	cuFuncGetAttribute(&attr.constSizeBytes, 
		CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, func);
	cuFuncGetAttribute(&attr.localSizeBytes, 
		CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, func);
	cuFuncGetAttribute(&attr.numRegs, 
		CU_FUNC_ATTRIBUTE_NUM_REGS, func);
	cuFuncGetAttribute(&attr.ptxVersion, 
		CU_FUNC_ATTRIBUTE_PTX_VERSION, func);
	cuFuncGetAttribute(&attr.binaryVersion, 
		CU_FUNC_ATTRIBUTE_BINARY_VERSION, func);
	f->_function = func;
	f->_module = module;
	f->_functionName = name;
	f->_blockShape = blockShape;
	ppFunction->swap(f);
	return CUDA_SUCCESS;
}
Example 2
SEXP
R_auto_cuFuncGetAttribute(SEXP r_attrib, SEXP r_hfunc)
{
    SEXP r_ans = R_NilValue;
    int pi;
    CUfunction_attribute attrib = (CUfunction_attribute) INTEGER(r_attrib)[0];
    CUfunction hfunc = (CUfunction) getRReference(r_hfunc);
    CUresult ans;
    ans = cuFuncGetAttribute(& pi,  attrib,  hfunc);
    if(ans)
       return(R_cudaErrorInfo(ans));
    r_ans = ScalarInteger(pi) ;
    return(r_ans);
}
Example 3
#include <cstddef>
#include <cstdio>
#include <cuda.h>

int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}
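The throughput expression at the end of Example 3 works out to shift operations per multiprocessor per clock cycle: the clock rate reported by CU_DEVICE_ATTRIBUTE_CLOCK_RATE is in kHz and the event time is in milliseconds, so their product is a cycle count. As a sketch (the factor 1024 is assumed to match the number of shift instructions each thread of the kernel, which is not shown here, executes), the same computation as a standalone helper:

/* Sketch of the throughput computation from Example 3.
 * Assumption: each thread executes 1024 shift instructions, matching the
 * 1024 factor used in the example's printout.
 * gpuclock_khz comes from CU_DEVICE_ATTRIBUTE_CLOCK_RATE (kHz) and
 * elapsed_ms from cuEventElapsedTime (milliseconds), so their product is a
 * number of clock cycles. */
static double shift_throughput(int block, int grid, int num_mps,
                               int gpuclock_khz, float elapsed_ms)
{
    double total_shifts = 1024.0 * block * grid;               /* operations    */
    double total_cycles = (double) gpuclock_khz * elapsed_ms;  /* kHz * ms      */
    return total_shifts / ((double) num_mps * total_cycles);   /* ops/MP/cycle  */
}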
Example 4
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id,
                         void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }
  /* I know that 512 and 1024 are magic numbers.
     There is an indication in buffer.h, though. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i,
                                  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                  k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
Example 5
static void print_function_attrs(CUmodule cuda_module, const char *func_name)
{
	CUfunction	kernel;
	CUresult	rc;
	int			max_threads_per_block;
	int			shared_mem_sz;
	int			const_mem_sz;
	int			local_mem_sz;
	int			num_regs;
	int			ptx_version;
	int			binary_version;
	int			cache_mode_ca;
	int			min_grid_sz;
	int			max_block_sz;
	int			i;
	struct {
		CUfunction_attribute attr;
		int	   *vptr;
	} catalog[] = {
		{ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, &max_threads_per_block },
		{ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,     &shared_mem_sz },
		{ CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,      &const_mem_sz },
		{ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,      &local_mem_sz },
		{ CU_FUNC_ATTRIBUTE_NUM_REGS,              &num_regs },
		{ CU_FUNC_ATTRIBUTE_PTX_VERSION,           &ptx_version },
		{ CU_FUNC_ATTRIBUTE_BINARY_VERSION,        &binary_version },
		{ CU_FUNC_ATTRIBUTE_CACHE_MODE_CA,         &cache_mode_ca },
	};

	rc = cuModuleGetFunction(&kernel, cuda_module, func_name);
	if (rc != CUDA_SUCCESS)
		cuda_error(rc, "cuModuleGetFunction");

	for (i=0; i < lengthof(catalog); i++)
	{
		rc = cuFuncGetAttribute(catalog[i].vptr,
								catalog[i].attr,
								kernel);
		if (rc != CUDA_SUCCESS)
			cuda_error(rc, "cuFuncGetAttribute");
	}

	rc = cuOccupancyMaxPotentialBlockSize(&min_grid_sz,
										  &max_block_sz,
										  kernel,
										  cb_occupancy_shmem_size,
										  dynamic_shmem_per_block,
										  1024 * 1024);
	if (rc != CUDA_SUCCESS)
		cuda_error(rc, "cuOccupancyMaxPotentialBlockSize");

	printf("Kernel Function:    %s\n"
		   "  Max threads per block:    %d\n"
		   "  Shared memory usage:      %d\n"
		   "  Constant memory usage:    %d\n"
		   "  Local memory usage:       %d\n"
		   "  Number of registers:      %d\n"
		   "  PTX version:              %d\n"
		   "  Binary version:           %d\n"
		   "  Global memory caching:    %s\n"
		   "  Max potential block size: %u\n"
		   "  (shmem usage: %ld/thread + %ld/block)\n",
		   func_name,
		   max_threads_per_block,
		   shared_mem_sz,
		   const_mem_sz,
		   local_mem_sz,
		   num_regs,
		   ptx_version,
		   binary_version,
		   cache_mode_ca ? "enabled" : "disabled",
		   max_block_sz,
		   dynamic_shmem_per_thread,
		   dynamic_shmem_per_block);
}
Example 6
 size_t shared_size_bytes() const {
     int n;
     cuda_check( cuFuncGetAttribute(&n, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, K) );
     return n;
 }
Example 7
 /// The maximum number of threads per block, beyond which a launch of the kernel would fail.
 size_t max_threads_per_block(const command_queue&) const {
     int n;
     cuda_check( cuFuncGetAttribute(&n, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, K) );
     return n;
 }
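Taken together, the examples above share one call pattern: obtain a CUfunction handle from a module, then pass an int out-parameter, a CU_FUNC_ATTRIBUTE_* enumerator, and the handle to cuFuncGetAttribute, checking the returned CUresult. A minimal standalone sketch of that pattern (query_max_threads and the -1 error value are illustrative only, not taken from any of the projects excerpted above):

#include <cuda.h>

/* Minimal sketch: look up a kernel in an already-loaded module and query its
 * maximum threads per block. Returns -1 on any driver API error. */
static int query_max_threads(CUmodule module, const char *kernel_name)
{
    CUfunction func;
    int max_threads = 0;

    if (cuModuleGetFunction(&func, module, kernel_name) != CUDA_SUCCESS)
        return -1;
    if (cuFuncGetAttribute(&max_threads,
                           CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                           func) != CUDA_SUCCESS)
        return -1;
    return max_threads;
}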