extern "C" void sarafft_init() { printf( "Cuda is about to be initialized!\n" ); fflush ( stdout ); char *OMPI_COMM_WORLD_LOCAL_RANK = getenv( "OMPI_COMM_WORLD_LOCAL_RANK" ); if ( NULL == OMPI_COMM_WORLD_LOCAL_RANK ) { printf( "OMPI_COMM_WORLD_LOCAL_RANK not set!\n" ); fflush ( stdout ); exit( 80 ); } int localRank = atoi( OMPI_COMM_WORLD_LOCAL_RANK ); printf( "Local rank is %d\n", localRank ); fflush ( stdout ); if ( CUDA_SUCCESS != cuInit( 0 ) ) { printf( "cuInit failed!\n" ); fflush ( stdout ); exit( 81 ); } CUdevice device; if ( CUDA_SUCCESS != cuDeviceGet( &device, localRank ) ) { printf( "cuDeviceGet failed!\n" ); fflush ( stdout ); exit( 82 ); } if ( CUDA_SUCCESS != cuCtxCreate( &cuda_context, CU_CTX_SCHED_YIELD, device ) ) { printf( "cuCtxCreate failed!\n" ); fflush ( stdout ); exit( 83 ); } printf( "Cuda was initialized successfully!\n" ); fflush ( stdout ); }
/*
 * Initializes the CUDA driver API, creates a context on device 0 and loads a
 * module from a file.
 *
 * pctx - receives the newly created context
 * pmod - receives the loaded module
 * f    - file name handed to cuModuleLoad
 *
 * Returns CUDA_SUCCESS, or the CUresult of the first call that failed (a
 * message is printed to stdout in that case). When the module cannot be
 * loaded, the context created just before is destroyed again.
 */
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
    CUdevice device;
    CUresult status;

    if ((status = cuInit(0)) != CUDA_SUCCESS) {
        printf("cuInit failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    if ((status = cuDeviceGet(&device, 0)) != CUDA_SUCCESS) {
        printf("cuDeviceGet failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    if ((status = cuCtxCreate(pctx, 0, device)) != CUDA_SUCCESS) {
        printf("cuCtxCreate failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    if ((status = cuModuleLoad(pmod, f)) != CUDA_SUCCESS) {
        printf("cuModuleLoad() failed\n");
        /* Do not leave a context behind that the caller cannot use. */
        cuCtxDestroy(*pctx);
        return status;
    }

    return CUDA_SUCCESS;
}
int main() { int ngpu; CUdevice cuDevice; CUcontext cuContext; cuInit(0); cuDeviceGetCount(&ngpu); //printf("ngpu = %d\n", ngpu); size_t *totals, *frees ; totals = (size_t *) calloc (ngpu, sizeof(size_t)); frees = (size_t *) calloc (ngpu, sizeof(size_t)); int tid; omp_set_num_threads(ngpu); #pragma omp parallel private(tid, cuDevice, cuContext) shared(frees, totals) { tid = omp_get_thread_num(); //printf("nthreads = %d, tid = %d\n", omp_get_num_threads(), tid); cuDeviceGet(&cuDevice, tid); cuCtxCreate(&cuContext, tid, cuDevice); cuMemGetInfo((size_t*)&frees[tid], (size_t*)&totals[tid]); } printf ("\ttotal\t\tfree\t\tused\n"); for(int i=0; i<ngpu; i++) { printf("GPU %d\t%lu\t%lu\t%lu\n", i, (size_t)totals[i], (size_t)frees[i], (size_t)totals[i]-(size_t)frees[i]); } return 0; }
void GPUInterface::GetDeviceDescription(int deviceNumber, char* deviceDescription) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr, "\t\t\tEntering GPUInterface::GetDeviceDescription\n"); #endif CUdevice tmpCudaDevice; SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber])); #if CUDA_VERSION >= 3020 size_t totalGlobalMemory = 0; #else unsigned int totalGlobalMemory = 0; #endif int clockSpeed = 0; int mpCount = 0; int major = 0; int minor = 0; SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice)); SAFE_CUDA(cuDeviceTotalMem(&totalGlobalMemory, tmpCudaDevice)); SAFE_CUDA(cuDeviceGetAttribute(&clockSpeed, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, tmpCudaDevice)); SAFE_CUDA(cuDeviceGetAttribute(&mpCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, tmpCudaDevice)); sprintf(deviceDescription, "Global memory (MB): %d | Clock speed (Ghz): %1.2f | Number of cores: %d", int(totalGlobalMemory / 1024.0 / 1024.0 + 0.5), clockSpeed / 1000000.0, nGpuArchCoresPerSM[major] * mpCount); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr, "\t\t\tLeaving GPUInterface::GetDeviceDescription\n"); #endif }
/* Sets up the CUDA device and context described by `info`.
 *
 * info        - info.num is used as the CUDA device ordinal
 * stats       - forwarded to the Device base class
 * background_ - when true a plain context is created; when false a GL
 *               interop context is attempted first and, if that fails, a
 *               plain context is created and `background` is set to true
 *
 * The constructor returns early, leaving the object partially set up, as
 * soon as one of the driver calls reports an error. On success the context
 * is popped via cuda_pop_context(). */
CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(stats)
{
	cuDevId = info.num;
	cuDevice = 0;
	cuContext = 0;
	background = background_;

	/* initialize the driver API */
	if(cuda_error(cuInit(0)))
		return;

	/* look up the device handle */
	if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
		return;

	/* create the context, preferring GL interop for foreground use */
	const bool wantGL = !background;
	CUresult result = wantGL ? cuGLCtxCreate(&cuContext, 0, cuDevice)
	                         : cuCtxCreate(&cuContext, 0, cuDevice);

	if(wantGL && result != CUDA_SUCCESS) {
		/* GL interop unavailable: fall back to a plain context */
		result = cuCtxCreate(&cuContext, 0, cuDevice);
		background = true;
	}

	if(cuda_error_(result, "cuCtxCreate"))
		return;

	cuda_pop_context();
}
main() { /* initialize CUDA */ CUresult res; res = cuInit(0); MY_CUDA_CHECK(res, "cuInit()"); /* check GPU is setted or not */ int device_num; res = cuDeviceGetCount(&device_num); MY_CUDA_CHECK(res, "cuDeviceGetCount()"); if (device_num == 0) { // no GPU is detected fprintf(stderr, "no CUDA capable GPU is detected...\n"); exit(1); } printf("%d GPUs are detected\n", device_num); for (int i=0; i<device_num; i++) { /* get device handle of GPU No.i */ CUdevice dev; res = cuDeviceGet(&dev, i); MY_CUDA_CHECK(res, "cuDeviceGet()"); /* search compute capability of GPU No.i */ int major=0, minor=0; res = cuDeviceComputeCapability(&major, &minor, dev); MY_CUDA_CHECK(res, "cuDeviceComputeCapability()"); printf("GPU[%d] : actual compute capability is : %d%d\n", i, major, minor); } }
void printout_devices( ) { int ndevices; cuDeviceGetCount( &ndevices ); for( int idevice = 0; idevice < ndevices; idevice++ ) { char name[200]; #if CUDA_VERSION > 3010 size_t totalMem; #else unsigned int totalMem; #endif int clock; CUdevice dev; cuDeviceGet( &dev, idevice ); cuDeviceGetName( name, sizeof(name), dev ); cuDeviceTotalMem( &totalMem, dev ); cuDeviceGetAttribute( &clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ); printf( "device %d: %s, %.1f MHz clock, %.1f MB memory\n", idevice, name, clock/1000.f, totalMem/1024.f/1024.f ); } }
int main(int argc, char* argv[]) { cuInit(0); int devs = 0; cuDeviceGetCount(&devs); assert(devs > 0); CUdevice dev; CUresult status; CUcontext ctx = 0; cuDeviceGet(&dev, 0); cuCtxCreate(&ctx, 0, dev); { size_t f = 0, t = 0; CUresult r = cuMemGetInfo(&f, &t); fprintf( stderr, "Do cuMemGetInfo: %d, %zu/%zu\n", r, f, t ); } __init("\n"); printf("\nPress any key to exit..."); char c; scanf("%c", &c); return 0; }
/*
 * Creates the CUDA context used by this process and stores it in cuContext
 * (declared outside this function).
 *
 * When the LOCAL_RANK environment variable is set, the device is chosen as
 * LOCAL_RANK modulo the number of devices; otherwise device 0 is used.
 * The whole body is compiled only when _ENABLE_CUDA_ is defined.
 *
 * Returns 0 on success (or when CUDA support is compiled out) and 1 on any
 * failure, including an unusable device count or a negative LOCAL_RANK.
 */
int init_cuda_context (void)
{
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
    int local_rank, dev_count = 0;
    int dev_id = 0;
    char * str;

    if ((str = getenv("LOCAL_RANK")) != NULL) {
        /* A failed or zero device count would make the modulo below
           undefined, so reject it up front. */
        if (cudaGetDeviceCount(&dev_count) != cudaSuccess || dev_count <= 0) {
            return 1;
        }

        local_rank = atoi(str);
        if (local_rank < 0) {
            /* would produce a negative device ordinal */
            return 1;
        }

        dev_id = local_rank % dev_count;
    }

    curesult = cuInit(0);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuDeviceGet(&cuDevice, dev_id);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }

    curesult = cuCtxCreate(&cuContext, 0, cuDevice);
    if (curesult != CUDA_SUCCESS) {
        return 1;
    }
#endif
    return 0;
}
/*
 * Initializes the CUDA driver API and creates a context on device 0.
 *
 * device_info - receives the device handle in ->dev and the new context in
 *               ->context; must not be NULL
 *
 * Returns 0 on success and -1 on failure. For a failing driver call a
 * message with the numeric result code is printed to stdout.
 */
int mcopy_gpu_init(struct device_info *device_info)
{
	CUresult res;

	if (device_info == NULL) {
		printf("mcopy_gpu_init: device_info is NULL\n");
		return -1;
	}

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	return 0;
}
/// Creates a context and a command queue for every device accepted by the filter.
/**
 * \param filter      callable invoked with each CUdevice; a device is used
 *                    only when it returns true
 * \param queue_flags forwarded to the command_queue constructor
 * \returns a pair of equally long vectors: the contexts and the queues
 *
 * Devices are visited in ordinal order. A device for which any step throws
 * `error` is silently skipped. Failures of do_init() or of the device count
 * query are not caught here.
 */
std::pair< std::vector<context>, std::vector<command_queue> >
queue_list(DevFilter &&filter, unsigned queue_flags = 0)
{
    cuda_check( do_init() );

    int ndev;
    cuda_check( cuDeviceGetCount(&ndev) );

    std::vector<context>       contexts;
    std::vector<command_queue> queues;

    for(int d = 0; d < ndev; ++d) {
        try {
            CUdevice dev;
            cuda_check( cuDeviceGet(&dev, d) );

            if (filter(dev)) {
                // Build both objects before storing either, so the two
                // vectors always stay the same length.
                context       c(dev);
                command_queue q(c, dev, queue_flags);

                contexts.push_back(c);
                queues.push_back(q);
            }
        } catch(const error&) {
            // This device cannot be used; move on to the next one.
        }
    }

    return std::make_pair(contexts, queues);
}
/*
 * Creates a CUDA context on device 0 with host-memory mapping and blocking
 * synchronization enabled.
 *
 * cuContext - receives the created context
 *
 * Returns true when the context was created and false when no device is
 * available or a driver call failed. Driver results are additionally passed
 * to checkError().
 */
bool initCuda(CUcontext & cuContext)
{
    // Initialize Cuda
    CUresult cerr;

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        // treat a failed query like "no device"
        deviceCount = 0;
    }

    if (deviceCount == 0) {
        fprintf(stderr, "Sorry, no CUDA device found");
        return false;
    }

    int selectedDevice = 0;
    if (selectedDevice >= deviceCount) {
        fprintf(stderr, "Choose device ID between 0 and %d\n", deviceCount-1);
        return false;
    }

    // Initialize the CUDA device
    CUdevice cuDevice;
    cerr = cuDeviceGet(&cuDevice,selectedDevice);
    checkError(cerr);
    if (cerr != CUDA_SUCCESS) {
        return false;
    }

    cerr = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST|CU_CTX_BLOCKING_SYNC, cuDevice);
    checkError(cerr);
    if (cerr != CUDA_SUCCESS) {
        return false;
    }

    // The success path previously fell off the end of a bool function.
    return true;
}
/*
 * Runs a CUDA kernel element-wise over one or more float arrays and returns
 * a newly allocated CudaFloatArray holding the result.
 *
 * argcv[0]          - number of input arrays (must be at least 1)
 * argv[0..n-1]      - the input arrays (struct CudaFloatArray *)
 * argv[n]           - string object naming the module file to load; the
 *                     kernel name is "block" followed by that path with its
 *                     first strlen("_cuda/") characters removed and cut at
 *                     the first '.'
 *
 * The kernel is launched with 256 threads per block and receives, in order,
 * the result buffer, one device buffer per input array and the element
 * count, which is the smallest size among the inputs.
 *
 * The context and the module created here are released before returning.
 * self, nparts and flags are not used.
 */
Object cuda_over_map(Object self, int nparts, int *argcv, Object *argv, int flags) {
    int nArrays = argcv[0];
    if (nArrays < 1) {
        /* Without inputs the element count would stay at INT_MAX. */
        raiseError("cuda_over_map requires at least one array argument");
    }

    errcheck(cuInit(0));

    int deviceCount = 0;
    errcheck(cuDeviceGetCount(&deviceCount));
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }

    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    errcheck(cuDeviceGet(&cuDevice, 0));
    errcheck(cuCtxCreate(&cuContext, 0, cuDevice));

    char *modulePath = grcstring(argv[nArrays]);
    errcheck(cuModuleLoad(&cuModule, modulePath));

    /* The kernel processes as many elements as the shortest input has. */
    int size = INT_MAX;
    for (int i=0; i<nArrays; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
    }
    size_t byteCount = (size_t)size * sizeof(float);

    CUdeviceptr d_res;
    CUdeviceptr dps[nArrays];
    void *args[nArrays+2];

    for (int i=0; i<nArrays; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        errcheck(cuMemAlloc(&dps[i], byteCount));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, byteCount));
        args[i+1] = &dps[i];
    }

    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    errcheck(cuMemAlloc(&d_res, byteCount));
    errcheck(cuMemcpyHtoD(d_res, &r->data, byteCount));
    args[0] = &d_res;
    args[nArrays+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    /* Build the kernel name without ever writing past name[]. A path that
       is shorter than the expected prefix contributes an empty suffix, which
       then fails cleanly in cuModuleGetFunction. */
    size_t prefixLen = strlen("_cuda/");
    const char *suffix = (strlen(modulePath) >= prefixLen) ? modulePath + prefixLen : "";
    char name[256];
    strcpy(name, "block");
    strncat(name, suffix, sizeof(name) - strlen(name) - 1);
    char *dot = strchr(name, '.');
    if (dot != NULL)
        *dot = 0;

    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
                threadsPerBlock, 1, 1, 0, NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, byteCount));

    cuMemFree(d_res);
    for (int i=0; i<nArrays; i++)
        cuMemFree(dps[i]);

    /* Release what this call created; otherwise every call leaks a module
       and a whole context. */
    cuModuleUnload(cuModule);
    cuCtxDestroy(cuContext);

    return (Object)r;
}
/*
 * Default constructor: binds this object to CUDA device 0.
 *
 * Initializes the driver API, fetches the handle of device 0 into cu_device
 * and reads the device properties into `props` so that device_name is taken
 * from populated data. No context is created here (the corresponding calls
 * are intentionally left commented out).
 */
device::device() {
    cuInit(0);
    checkCudaError("device::device Init");

    cuDeviceGet(&cu_device, 0);
    checkCudaError("device::device Get device");

    //cuCtxCreate(&cu_context, 0, cu_device);
    //checkCudaError("device::device Create context");

    // props must be filled before its name field is read below.
    cudaGetDeviceProperties(&props, 0);
    checkCudaError("device::device Get properties ");

    device_name = props.name;
}
/*
 * Computes a block count for the given device.
 *
 * The active code exists only when TODO is defined; otherwise the function
 * returns 0 and leaves *max_block_num and *mp_num untouched.
 *
 * With TODO defined:
 *   - the per-multiprocessor limit from shared memory is
 *     sharedMemPerBlock / (large_size * word_size + 16)
 *   - the per-multiprocessor limit from threads is a version dependent
 *     thread count (presumably the resident-thread limit of that compute
 *     capability - TODO confirm) divided by thread_num
 *   - the smaller of the two is stored in *max_block_num, the
 *     multiprocessor count in *mp_num, and their product is returned
 *   - -1 is returned when the device reports version 9999.9999 (presumably
 *     the marker for "no real CUDA device" - TODO confirm)
 *
 * A register based limit is present but disabled with #if 0.
 */
int get_suitable_block_num(int device, int *max_block_num, int *mp_num,
                           int word_size, int thread_num, int large_size)
{
#ifdef TODO
    cudaDeviceProp dev;
    CUdevice cuDevice;
    int major, minor;
    int max_thread_dev;
    int max_block;
    //int regs, max_block_regs;

    ccudaGetDeviceProperties(&dev, device);
    cuDeviceGet(&cuDevice, device);
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    //cudaFuncGetAttributes()
#if 0
    if (word_size == 4) {
        regs = 14;
    } else {
        regs = 16;
    }
    max_block_regs = dev.regsPerBlock / (regs * thread_num);
#endif

    /* limit imposed by shared memory */
    const int max_block_mem = dev.sharedMemPerBlock / (large_size * word_size + 16);

    if (major == 9999 && minor == 9999) {
        return -1;
    }

    /* limit imposed by the thread count for this version */
    const int ver = major * 100 + minor;
    if (ver <= 101) {
        max_thread_dev = 768;
    } else if (ver <= 103) {
        max_thread_dev = 1024;
    } else {
        max_thread_dev = 1536;
    }
    const int max_block_dev = max_thread_dev / thread_num;

    /* the tighter limit wins */
    max_block = (max_block_mem < max_block_dev) ? max_block_mem : max_block_dev;
#if 0
    if (max_block_regs < max_block) {
        max_block = max_block_regs;
    }
#endif

    *max_block_num = max_block;
    *mp_num = dev.multiProcessorCount;
    return max_block * dev.multiProcessorCount;
#else
    return 0;
#endif
}
bool GPUInterface::GetSupportsDoublePrecision(int deviceNumber) { CUdevice tmpCudaDevice; SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber])); int major = 0; int minor = 0; SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice)); return (major >= 2 || (major >= 1 && minor >= 3)); }
int init_accel (void) { #if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_) char * str; int local_rank, dev_count; int dev_id = 0; #endif #ifdef _ENABLE_CUDA_ CUresult curesult = CUDA_SUCCESS; CUdevice cuDevice; #endif switch (options.accel) { #ifdef _ENABLE_CUDA_ case managed: case cuda: if ((str = getenv("LOCAL_RANK")) != NULL) { cudaGetDeviceCount(&dev_count); local_rank = atoi(str); dev_id = local_rank % dev_count; } curesult = cuInit(0); if (curesult != CUDA_SUCCESS) { return 1; } curesult = cuDeviceGet(&cuDevice, dev_id); if (curesult != CUDA_SUCCESS) { return 1; } curesult = cuCtxCreate(&cuContext, 0, cuDevice); if (curesult != CUDA_SUCCESS) { return 1; } break; #endif #ifdef _ENABLE_OPENACC_ case openacc: if ((str = getenv("LOCAL_RANK")) != NULL) { dev_count = acc_get_num_devices(acc_device_not_host); local_rank = atoi(str); dev_id = local_rank % dev_count; } acc_set_device_num (dev_id, acc_device_not_host); break; #endif default: fprintf(stderr, "Invalid device type, should be cuda or openacc\n"); return 1; } return 0; }
static void *cuda_init(int ord, int flags, int *ret) { CUdevice dev; cuda_context *res; static int init_done = 0; if (ord == -2) { CUcontext ctx; /* Grab the ambient context */ err = cuCtxGetCurrent(&ctx); CHKFAIL(NULL); /* If somebody made a context, then the api is initialized */ init_done = 1; res = cuda_make_ctx(ctx, DONTFREE); if (res == NULL) { FAIL(NULL, GA_IMPL_ERROR); } res->flags |= flags; return res; } if (!init_done) { err = cuInit(0); CHKFAIL(NULL); init_done = 1; } if (ord == -1) { int i, c; err = cuDeviceGetCount(&c); CHKFAIL(NULL); for (i = 0; i < c; i++) { err = cuDeviceGet(&dev, i); CHKFAIL(NULL); res = do_init(dev, flags, NULL); if (res != NULL) return res; } FAIL(NULL, GA_NODEV_ERROR); } else { err = cuDeviceGet(&dev, ord); CHKFAIL(NULL); return do_init(dev, flags, ret); } }
void getBestDevice(){ int num_devices; int status; int i; CUdevice temp_device; int curr_multiprocessors; int max_multiprocessors = -1; int max_i = -1; status = cuDeviceGetCount(&num_devices); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetCount\n"); } for(i = 0; i < num_devices; ++i){ status = cuDeviceGet(&temp_device, i); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGet\n"); } status = cuDeviceGetAttribute(&curr_multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT\n"); } if(curr_multiprocessors > max_multiprocessors) { max_multiprocessors = curr_multiprocessors; max_i = i; } } status = cuDeviceGet(&cuDevice, max_i); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetName\n"); } status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X\n"); } numMultiProcessors = max_multiprocessors; }
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time. This
 * program uses the CUDA driver API.
 *
 * For every device the program creates a context, loads the module from
 * imageBytes, launches "kernel" ITERATIONS times with a single thread in a
 * single block and prints the elapsed time between two events divided by
 * ITERATIONS. Every driver call is wrapped in CU_ERROR_CHECK.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  // Host value uploaded to the input buffer of every device.
  float x = 5.0f;

  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    // One float of input and one float of output on the device.
    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    // Kernel arguments are passed as an array of pointers to the values.
    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    // Both events and all launches use stream 0.
    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));
    CU_ERROR_CHECK(cuEventRecord(stop, 0));

    // Wait for the stop event before reading the elapsed time.
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    // Average time per launch.
    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
/*===========================================================================*/
/**
 *  @brief  Binds this object to a device and reads its properties.
 *  @param  ordinal [in] device ordinal passed to cuDeviceGet
 *  @return true if both the handle and the properties were obtained without
 *          kvs::cuda::DriverAPI reporting an error, false otherwise
 */
/*===========================================================================*/
bool Device::create( const int ordinal )
{
    KVS_CU_CALL( cuDeviceGet( &m_handler, ordinal ) );
    if ( kvs::cuda::DriverAPI::HasError() )
    {
        return false;
    }

    KVS_CU_CALL( cuDeviceGetProperties( &m_property, m_handler ) );
    return !kvs::cuda::DriverAPI::HasError();
}
/*
 * Reports whether a device has compute capability 1.3 or higher.
 *
 * deviceNumber - index into resourceMap; the mapped value is used as the
 *                CUDA device ordinal
 */
bool GPUInterface::GetSupportsDoublePrecision(int deviceNumber) {
    CUdevice tmpCudaDevice;
    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

    int major = 0;
    int minor = 0;
    SAFE_CUDA(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, tmpCudaDevice));

    // Every 2.x or newer device qualifies.
    if (major >= 2) {
        return true;
    }

    // Otherwise only 1.3 and above qualifies.
    return (major >= 1 && minor >= 3);
}
// Constructor callback: creates a CudaDevice for the ordinal given as the
// first argument and wraps it in the receiver object.
//
// args[0] is converted with IntegerValue() and passed to cuDeviceGet; the
// result of cuDeviceGet is not inspected. Returns the receiver (args.This()).
Handle<Value> CudaDevice::New(const Arguments& args) {
  HandleScope scope;

  // Read the argument first, then allocate the native object.
  const int ordinal = args[0]->IntegerValue();
  CudaDevice *nativeDevice = new CudaDevice();

  cuDeviceGet(&(nativeDevice->m_device), ordinal);

  nativeDevice->Wrap(args.This());
  return args.This();
}
/* * Initializaiton in order to use kernel program */ void init_cuda(void){ thread_num = (N <= 16) ? N : 16 ; block_num = N / (thread_num*thread_num); if(N % (thread_num*thread_num) != 0) block_num++; res = cuInit(0); if(res != CUDA_SUCCESS){ printf("cuInit failed: res = %s\n", conv(res)); exit(1); } res = cuDeviceGet(&dev, 0); if(res != CUDA_SUCCESS){ printf("cuDeviceGet failed: res = %s\n", conv(res)); exit(1); } res = cuCtxCreate(&ctx, 0, dev); if(res != CUDA_SUCCESS){ printf("cuCtxCreate failed: res = %s\n", conv(res)); exit(1); } res = cuModuleLoad(&module, "./cuda_main.cubin"); if(res != CUDA_SUCCESS){ printf("cuModuleLoad() failed: res = %s\n", conv(res)); exit(1); } res = cuModuleGetFunction(&function, module, "cuda_main"); if(res != CUDA_SUCCESS){ printf("cuModuleGetFunction() failed: res = %s\n", conv(res)); exit(1); } /* * preparation for launch kernel */ res = cuFuncSetSharedSize(function, 0x40); /* just random */ if(res != CUDA_SUCCESS){ printf("cuFuncSetSharedSize() failed: res = %s\n", conv(res)); exit(1); } res = cuFuncSetBlockShape(function, thread_num, thread_num, 1); if(res != CUDA_SUCCESS){ printf("cuFuncSetBlockShape() failed: res = %s\n", conv(res)); exit(1); } }
/*
 * Picks the CUDA device with the most multiprocessors.
 *
 * On success the handle is stored in cuDevice, its multiprocessor count in
 * numMultiProcessors and its CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X value in
 * maxGridDim (all declared outside this function).
 *
 * env - JNI environment handed to CHECK_STATUS and
 *       throw_cuda_errror_exception for error reporting
 *
 * When no device is present an exception is raised through
 * throw_cuda_errror_exception and the function returns immediately.
 */
void getBestDevice(JNIEnv *env){
  int num_devices = 0;
  int status;
  int i;
  CUdevice temp_device;
  int curr_multiprocessors = 0;
  int max_multiprocessors = -1;
  int max_i = -1;

  status = cuDeviceGetCount(&num_devices);
  CHECK_STATUS(env,"error in cuDeviceGetCount",status)

  if(num_devices == 0){
    throw_cuda_errror_exception(env,"0 Cuda Devices were found",0);
    /* Stop here: continuing would query device -1 while the exception
       raised above is still pending. */
    return;
  }

  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&temp_device, i);
    CHECK_STATUS(env,"error in cuDeviceGet",status)

    status = cuDeviceGetAttribute(&curr_multiprocessors,
      CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device);
    CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)

    if(curr_multiprocessors > max_multiprocessors) {
      max_multiprocessors = curr_multiprocessors;
      max_i = i;
    }
  }

  status = cuDeviceGet(&cuDevice, max_i);
  CHECK_STATUS(env,"error in cuDeviceGet",status)

  status = cuDeviceGetAttribute(&maxGridDim,
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);
  CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)

  numMultiProcessors = max_multiprocessors;
}
/*
 * Returns the name of CUDA device 0 as a string object.
 *
 * Raises an error through raiseError() when the driver cannot be
 * initialized, no device is present or the name cannot be queried. The name
 * buffer starts out empty, so alloc_String never sees uninitialized data.
 * None of the parameters are used.
 */
Object cuda_deviceName(Object self, int nparts, int *argcv, Object *argv, int flags) {
    if (cuInit(0) != CUDA_SUCCESS) {
        raiseError("CUDA initialisation failed");
    }

    int deviceCount = 0;
    if (cuDeviceGetCount(&deviceCount) != CUDA_SUCCESS) {
        /* treat a failed query like "no device" */
        deviceCount = 0;
    }
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }

    CUdevice cuDevice;
    char name[100] = "";
    if (cuDeviceGet(&cuDevice, 0) != CUDA_SUCCESS
            || cuDeviceGetName(name, (int)sizeof(name), cuDevice) != CUDA_SUCCESS) {
        raiseError("Could not query CUDA device name");
    }

    return alloc_String(name);
}
// General initialization call to pick the best CUDA Device inline CUdevice findCudaDeviceDRV(bool verbose = false) { CUdevice cuDevice; int devID = 0; // Otherwise pick the device with highest Gflops/s char name[100]; devID = 0; CHECK_CUDA_CALL( cuDeviceGet(&cuDevice, devID) , "Couldn't get the device"); cuDeviceGetName(name, 100, cuDevice); if(verbose) { std::cout << "Using CUDA Device " << devID << ": " << name << std::endl; } cuDeviceGet(&cuDevice, devID); return cuDevice; }
/*
 * Binds this object to the CUDA device with the given ordinal.
 *
 * Initializes the driver API, stores the device handle in cu_device, calls
 * set_device(device_id), reads the device properties into `props` and copies
 * the device name from them. checkCudaError() is called after each step that
 * is labelled below. No context is created (those calls are commented out).
 */
device::device(int device_id) {
    // driver initialization
    cuInit(0);
    checkCudaError("device::device Init");

    // device handle
    cuDeviceGet(&cu_device, device_id);
    checkCudaError("device::device Get device");

    //cuCtxCreate(&cu_context, 0, cu_device);
    //checkCudaError("device::device Create context");

    set_device(device_id);

    // device properties and name
    cudaGetDeviceProperties(&props, device_id);
    checkCudaError("device::device Get properties ");
    device_name = props.name;
}
int main(int argc,char **argv){ unsigned total = 0; unsigned long zul; ctx marsh; if(argc != 3){ usage(argv[0]); exit(EXIT_FAILURE); } if(getzul(argv[1],&zul)){ usage(argv[0]); exit(EXIT_FAILURE); } if(getzul(argv[2],&marsh.s)){ usage(argv[0]); exit(EXIT_FAILURE); } if(cuInit(0)){ fprintf(stderr,"Couldn't initialize cuda\n"); exit(EXIT_FAILURE); } if(cuDeviceGet(&marsh.dev,zul)){ fprintf(stderr,"Couldn't get device %lu\n",zul); exit(EXIT_FAILURE); } while( (marsh.threadno = ++total) ){ pthread_t tid; int err; if( (err = pthread_create(&tid,NULL,thread,&marsh)) ){ fprintf(stderr,"Couldn't create thread %d (%s?)\n", total,strerror(err)); exit(EXIT_SUCCESS); } pthread_mutex_lock(&lock); while(!thrdone && threadsmaintain){ pthread_cond_wait(&cond,&lock); } thrdone = 0; if(!threadsmaintain){ pthread_mutex_unlock(&lock); fprintf(stderr,"Thread %d exited with an error.\n",total); break; } pthread_mutex_unlock(&lock); printf("Created thread %d\n",total); } exit(EXIT_SUCCESS); }
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }