// Resolves `name` inside `module`, caches the function's resource attributes
// (register count, shared/const/local memory usage, versions), and publishes
// a freshly built CuFunction through `ppFunction` via swap.
// Returns CUDA_SUCCESS, or the first failing driver-API status code.
// NOTE(review): assumes every CuFuncAttr field queried below is `int`, which
// the original code already required (they are passed to cuFuncGetAttribute).
CUresult CreateCuFunction(const char* name, CuModule* module, int3 blockShape,
                          FunctionPtr* ppFunction) {
    CUfunction func;
    CUresult result = cuModuleGetFunction(&func, module->Handle(), name);
    if(CUDA_SUCCESS != result) return result;

    FunctionPtr f(new CuFunction);
    CuFuncAttr& attr = f->_attributes;

    // Query each attribute and propagate the first failure instead of
    // silently publishing a partially-initialized attribute struct
    // (the original ignored all seven return codes).
    const struct AttrQuery { int* dst; CUfunction_attribute key; } queries[] = {
        { &attr.maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK },
        { &attr.sharedSizeBytes,    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES },
        { &attr.constSizeBytes,     CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES },
        { &attr.localSizeBytes,     CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES },
        { &attr.numRegs,            CU_FUNC_ATTRIBUTE_NUM_REGS },
        { &attr.ptxVersion,         CU_FUNC_ATTRIBUTE_PTX_VERSION },
        { &attr.binaryVersion,      CU_FUNC_ATTRIBUTE_BINARY_VERSION }
    };
    for (int q = 0; q < (int)(sizeof(queries) / sizeof(queries[0])); ++q) {
        result = cuFuncGetAttribute(queries[q].dst, queries[q].key, func);
        if(CUDA_SUCCESS != result) return result;
    }

    f->_function = func;
    f->_module = module;
    f->_functionName = name;
    f->_blockShape = blockShape;
    ppFunction->swap(f);
    return CUDA_SUCCESS;
}
/* R binding for cuFuncGetAttribute: reads the attribute selected by
 * `r_attrib` from the CUfunction handle wrapped in `r_hfunc` and returns it
 * as an R integer scalar.  On driver-API failure, returns the R error
 * object built by R_cudaErrorInfo instead. */
SEXP R_auto_cuFuncGetAttribute(SEXP r_attrib, SEXP r_hfunc) {
    CUfunction_attribute which = (CUfunction_attribute) INTEGER(r_attrib)[0];
    CUfunction fn = (CUfunction) getRReference(r_hfunc);
    int value = 0;
    CUresult status = cuFuncGetAttribute(&value, which, fn);
    if (status != CUDA_SUCCESS)
        return R_cudaErrorInfo(status);
    /* No PROTECT needed: the freshly allocated scalar is returned at once. */
    return ScalarInteger(value);
}
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }
/* Generic property-query dispatcher for the CUDA backend.  Exactly one of
 * `c` (context), `buf` (buffer) or `k` (kernel) must be non-NULL, matching
 * the range `prop_id` falls in (<512 context, <1024 buffer, else kernel —
 * the ranges come from buffer.h).  The result is written through `res`,
 * whose pointed-to type depends on the property.
 * Returns GA_NO_ERROR on success, GA_VALUE_ERROR on a missing handle,
 * GA_IMPL_ERROR on a driver-API failure (recorded in ctx->err),
 * GA_MEMORY_ERROR on allocation failure, GA_INVALID_ERROR on an unknown id.
 *
 * Fix: the GA_CTX_PROP_DEVNAME error path leaked the malloc'd name buffer
 * when cuDeviceGetName failed; it is now freed. */
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, void *res) {
  cuda_context *ctx = NULL;

  /* Pick up the context from whichever handle was supplied. */
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }

  /* I know that 512 and 1024 are magic numbers. There is an indication in
     buffer.h, though. */
  if (prop_id < 512) {
    if (ctx == NULL) return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL) return GA_VALUE_ERROR;
  } else {
    if (k == NULL) return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    /* Scratch variables shared across cases (declared before the first
       label, so they exist for the whole switch). */
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      free(s);  /* fix: do not leak the name buffer on failure */
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* Caller takes ownership of the heap string. */
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    /* cuMemGetInfo(free, total): discard free, keep total. */
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    /* cuMemGetInfo(free, total): keep free, discard total. */
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    /* Preferred local size is reported as the device's warp size. */
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
/* Queries the resource attributes of `func_name` within `cuda_module` and
 * prints them along with the occupancy-optimal block size.
 * Any driver-API failure is routed to cuda_error() — NOTE(review): assumed
 * noreturn (the code falls through otherwise); confirm its definition. */
static void print_function_attrs(CUmodule cuda_module, const char *func_name) {
    CUfunction kernel;
    CUresult rc;
    int max_threads_per_block;
    int shared_mem_sz;
    int const_mem_sz;
    int local_mem_sz;
    int num_regs;
    int ptx_version;
    int binary_version;
    int cache_mode_ca;
    int min_grid_sz;
    int max_block_sz;
    int i;
    /* Attribute id -> destination variable, fetched in one loop below. */
    struct {
        CUfunction_attribute attr;
        int *vptr;
    } catalog[] = {
        { CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, &max_threads_per_block },
        { CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, &shared_mem_sz },
        { CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, &const_mem_sz },
        { CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, &local_mem_sz },
        { CU_FUNC_ATTRIBUTE_NUM_REGS, &num_regs },
        { CU_FUNC_ATTRIBUTE_PTX_VERSION, &ptx_version },
        { CU_FUNC_ATTRIBUTE_BINARY_VERSION, &binary_version },
        { CU_FUNC_ATTRIBUTE_CACHE_MODE_CA, &cache_mode_ca },
    };

    rc = cuModuleGetFunction(&kernel, cuda_module, func_name);
    if (rc != CUDA_SUCCESS)
        cuda_error(rc, "cuModuleGetFunction");
    for (i=0; i < lengthof(catalog); i++) {
        rc = cuFuncGetAttribute(catalog[i].vptr, catalog[i].attr, kernel);
        if (rc != CUDA_SUCCESS)
            cuda_error(rc, "cuFuncGetAttribute");
    }
    /* min_grid_sz is required by the API but is not reported below. */
    rc = cuOccupancyMaxPotentialBlockSize(&min_grid_sz, &max_block_sz, kernel,
                                          cb_occupancy_shmem_size,
                                          dynamic_shmem_per_block,
                                          1024 * 1024);
    if (rc != CUDA_SUCCESS)
        cuda_error(rc, "cuOccupancyMaxPotentialBlockSize");
    /* fix: max_block_sz is int, so use %d rather than the mismatched %u. */
    printf("Kernel Function: %s\n"
           " Max threads per block: %d\n"
           " Shared memory usage: %d\n"
           " Constant memory usage: %d\n"
           " Local memory usage: %d\n"
           " Number of registers: %d\n"
           " PTX version: %d\n"
           " Binary version: %d\n"
           " Global memory caching: %s\n"
           " Max potential block size: %d\n"
           " (shmem usage: %ld/thread + %ld/block)\n",
           func_name,
           max_threads_per_block,
           shared_mem_sz,
           const_mem_sz,
           local_mem_sz,
           num_regs,
           ptx_version,
           binary_version,
           cache_mode_ca ? "enabled" : "disabled",
           max_block_sz,
           dynamic_shmem_per_thread,
           dynamic_shmem_per_block);
}
/// Amount of statically allocated shared memory used by the kernel, in bytes
/// (as reported by CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES).
size_t shared_size_bytes() const {
    int bytes = 0;
    cuda_check( cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, K) );
    return static_cast<size_t>(bytes);
}
/// The maximum number of threads per block, beyond which a launch of the kernel would fail. size_t max_threads_per_block(const command_queue&) const { int n; cuda_check( cuFuncGetAttribute(&n, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, K) ); return n; }