// Fills deviceDescription with a human-readable summary (global memory, clock
// speed, core count) of the CUDA device mapped to the given resource index.
// NOTE(review): assumes deviceDescription is large enough for the formatted
// string (sprintf is unbounded) and that nGpuArchCoresPerSM has an entry for
// this device's major compute capability -- confirm at call sites.
void GPUInterface::GetDeviceDescription(int deviceNumber, char* deviceDescription) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::GetDeviceDescription\n");
#endif
    CUdevice tmpCudaDevice;
    // Translate the BEAGLE resource index into the underlying CUDA device handle.
    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));
#if CUDA_VERSION >= 3020
    // cuDeviceTotalMem takes a size_t from CUDA 3.2 onward.
    size_t totalGlobalMemory = 0;
#else
    unsigned int totalGlobalMemory = 0;
#endif
    int clockSpeed = 0;
    int mpCount = 0;
    int major = 0;
    int minor = 0;
    SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice));
    SAFE_CUDA(cuDeviceTotalMem(&totalGlobalMemory, tmpCudaDevice));
    // CU_DEVICE_ATTRIBUTE_CLOCK_RATE is reported in kHz; divided by 1e6 below for GHz.
    SAFE_CUDA(cuDeviceGetAttribute(&clockSpeed, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&mpCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, tmpCudaDevice));
    // +0.5 rounds the MB figure to the nearest integer; total cores =
    // cores-per-SM (looked up by major version) * multiprocessor count.
    sprintf(deviceDescription, "Global memory (MB): %d | Clock speed (Ghz): %1.2f | Number of cores: %d", int(totalGlobalMemory / 1024.0 / 1024.0 + 0.5), clockSpeed / 1000000.0, nGpuArchCoresPerSM[major] * mpCount);
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving GPUInterface::GetDeviceDescription\n");
#endif
}
/*
 * Queries and prints every device attribute of `c`, from the first valid
 * attribute up to CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID.  Attributes up to
 * CU_DEVICE_ATTRIBUTE_ECC_ENABLED are printed in decimal; the remaining
 * (PCI id) attributes are printed in hex.
 *
 * Returns 0 on success, -1 if any attribute query fails.
 *
 * Fix: the CUdevice_attribute enum starts at 1
 * (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK == 1); the loop previously
 * started at 0, so the very first cuDeviceGetAttribute() call used an invalid
 * attribute, failed, and the function always returned -1 without printing
 * anything.
 */
static int get_all_attributes(CUdevice c){
    int attr, n;
    for (n = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK; n <= CU_DEVICE_ATTRIBUTE_ECC_ENABLED; ++n) {
        CUresult cerr;
        if ((cerr = cuDeviceGetAttribute(&attr, n, c))) {
            fprintf(stderr, "Error acquiring device attr %d (%d)\n", n, cerr);
            return -1;
        }
        printf("Device attribute %d: %d\n", n, attr);
    }
    /* PCI bus/device ids read better in hex. */
    while (n <= CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) {
        CUresult cerr;
        if ((cerr = cuDeviceGetAttribute(&attr, n, c))) {
            fprintf(stderr, "Error acquiring device attr %d (%d)\n", n, cerr);
            return -1;
        }
        printf("Device attribute %d: 0x%04x\n", n, attr);
        ++n;
    }
    return 0;
}
/// Returns device compute capability as a tuple of major and minor version numbers. std::tuple<int, int> compute_capability() const { int major, minor; cuda_check( cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, d) ); cuda_check( cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, d) ); return std::make_tuple(major, minor); }
bool GPUInterface::GetSupportsDoublePrecision(int deviceNumber) { CUdevice tmpCudaDevice; SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber])); int major = 0; int minor = 0; SAFE_CUDA(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, tmpCudaDevice)); SAFE_CUDA(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, tmpCudaDevice)); return (major >= 2 || (major >= 1 && minor >= 3)); }
// Populates this CUDADevice's cached properties (name, compute capability,
// compute mode, total memory, and launch-dimension limits) by querying the
// CUDA driver for the given device handle.
// NOTE(review): none of the driver-API return codes below are checked; on
// failure the corresponding members are left unmodified -- confirm callers
// tolerate a partially-initialized device.
CUDADevice::CUDADevice(const CUdevice device_number)
    : m_cuda_device_number(device_number)
{
    // Device display name (the driver truncates to the supplied length).
    char device_name[256];
    cuDeviceGetName(device_name, 256, m_cuda_device_number);
    m_name = device_name;
    // Compute capability as a (major, minor) pair.
    cuDeviceComputeCapability( &m_compute_capability.first, &m_compute_capability.second, m_cuda_device_number);
    cuDeviceGetAttribute(&m_compute_mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, m_cuda_device_number);
    cuDeviceTotalMem(&m_total_mem, m_cuda_device_number);
    // Per-block and per-grid launch limits.
    cuDeviceGetAttribute(&m_max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_block_dim_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_block_dim_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_block_dim_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_grid_dim_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_grid_dim_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, m_cuda_device_number);
    // Register budget available to a single block.
    cuDeviceGetAttribute(&m_max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, m_cuda_device_number);
}
/*
 * Retrieves the compute capability (major/minor) of `dev`.
 * Toolkits older than 6.5 use the legacy cuDeviceComputeCapability() entry
 * point; newer ones use the split attribute queries.
 * Returns CUDA_SUCCESS, or the first failing driver error code.
 */
static CUresult get_cc(CUdevice dev, int *maj, int *min) {
#if CUDA_VERSION < 6500
  return cuDeviceComputeCapability(maj, min, dev);
#else
  CUresult status = cuDeviceGetAttribute(maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
  if (status == CUDA_SUCCESS)
    status = cuDeviceGetAttribute(min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
  return status;
#endif
}
// Micro-benchmark: times the "kernel" entry of cuda-shift-throughput.cubin
// with CUDA events and reports shift-instruction throughput normalized by the
// device's multiprocessor count and clock rate.
// NOTE(review): every driver-API status is stored in `result` but never
// checked; a failed load/launch silently produces garbage timings.
int main() {
    CUresult result;
    result = cuInit(0);
    CUdevice device;
    result = cuDeviceGet(&device, 0);
    CUcontext ctx;
    result = cuCtxCreate(&ctx, 0, device);
    CUmodule module;
    result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
    CUfunction kernel;
    result = cuModuleGetFunction(&kernel, module, "kernel");
    int block;
    // Launch with the largest block size the kernel supports.
    result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
    int grid = 1024 * 1024;
    CUevent event[2];
    for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); }
    // Bracket the launch with events on the default stream to measure GPU time.
    result = cuEventRecord(event[0], 0);
    result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
    result = cuEventRecord(event[1], 0);
    result = cuEventSynchronize(event[1]);
    float time;
    result = cuEventElapsedTime(&time, event[0], event[1]);
    int gpuclock;
    // Clock rate attribute is reported in kHz.
    result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
    int gpump;
    result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
    std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
    std::printf("Elapsed Time: %f milliseconds\n", time);
    // 1024 shifts per thread (per the cubin's kernel); total = 1024 * block * grid.
    std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid);
    std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time));
    for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); }
    result = cuModuleUnload(module);
    result = cuCtxDestroy(ctx);
    return 0;
}
void printout_devices( ) { int ndevices; cuDeviceGetCount( &ndevices ); for( int idevice = 0; idevice < ndevices; idevice++ ) { char name[200]; #if CUDA_VERSION > 3010 size_t totalMem; #else unsigned int totalMem; #endif int clock; CUdevice dev; cuDeviceGet( &dev, idevice ); cuDeviceGetName( name, sizeof(name), dev ); cuDeviceTotalMem( &totalMem, dev ); cuDeviceGetAttribute( &clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ); printf( "device %d: %s, %.1f MHz clock, %.1f MB memory\n", idevice, name, clock/1000.f, totalMem/1024.f/1024.f ); } }
/* Creates a cuda_context for `dev` with a CUDA scheduling policy derived from
 * the gpuarray context flags.  Returns the new context, or NULL after the
 * FAIL/CHKFAIL macros record the error code through *ret.
 * NOTE(review): `err` is not declared locally -- it appears to be the
 * file-scope error variable that CHKFAIL inspects; confirm in the surrounding
 * file. */
static void *do_init(CUdevice dev, int flags, int *ret) {
  cuda_context *res;
  CUcontext ctx;
  unsigned int fl = CU_CTX_SCHED_AUTO;
  int i;
  CHKFAIL(NULL);
  /* Map gpuarray threading hints onto CUDA scheduling policies. */
  if (flags & GA_CTX_SINGLE_THREAD)
    fl = CU_CTX_SCHED_SPIN;
  if (flags & GA_CTX_MULTI_THREAD)
    fl = CU_CTX_SCHED_YIELD;
  /* This backend requires unified addressing (one pointer space). */
  err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
  CHKFAIL(NULL);
  if (i != 1)
    FAIL(NULL, GA_UNSUPPORTED_ERROR);
  err = cuCtxCreate(&ctx, fl, dev);
  CHKFAIL(NULL);
  res = cuda_make_ctx(ctx, 0);
  if (res == NULL) {
    /* Wrapping failed: tear the raw context back down before bailing. */
    cuCtxDestroy(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }
  res->flags |= flags;
  /* Don't leave the context on the thread stack */
  cuCtxPopCurrent(NULL);
  return res;
}
// Fetches a single device attribute into *attribute, logging and terminating
// the process if the driver-API query fails.
// Fix: the failure path previously called exit(0), which reports *success* to
// the calling shell; exit with a failure status instead.
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error_result = cuDeviceGetAttribute( attribute, device_attribute, device );
    if (error_result != CUDA_SUCCESS) {
        shrLog( "cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result) );
        exit(EXIT_FAILURE);
    }
}
static int _gaspi_find_GPU_numa_node(int cudevice) { CUresult cres; int domain, bus, dev; char path[128]; FILE *sysfile = NULL; domain = 0; #ifdef CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID cres = cuDeviceGetAttribute(&domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudevice); if( CUDA_SUCCESS != cres ) { errno = ENOSYS; return -1; } #endif cres = cuDeviceGetAttribute(&bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudevice); if( CUDA_SUCCESS != cres ) { return -1; } cres = cuDeviceGetAttribute(&dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudevice); if( CUDA_SUCCESS != cres ) { return -1; } sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", domain, bus, dev); sysfile = fopen(path, "r"); if( !sysfile ) { gaspi_print_error("Failed to open %s.", path); return -1; } int numa_node = -1; fscanf (sysfile, "%1d", &numa_node); fclose(sysfile); return numa_node; }
void getBestDevice(){ int num_devices; int status; int i; CUdevice temp_device; int curr_multiprocessors; int max_multiprocessors = -1; int max_i = -1; status = cuDeviceGetCount(&num_devices); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetCount\n"); } for(i = 0; i < num_devices; ++i){ status = cuDeviceGet(&temp_device, i); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGet\n"); } status = cuDeviceGetAttribute(&curr_multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT\n"); } if(curr_multiprocessors > max_multiprocessors) { max_multiprocessors = curr_multiprocessors; max_i = i; } } status = cuDeviceGet(&cuDevice, max_i); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetName\n"); } status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice); if (CUDA_SUCCESS != status) { printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X\n"); } numMultiProcessors = max_multiprocessors; }
// Reads one CUDA device attribute into *attribute; on driver failure the
// error is reported with file/line context and the process terminates.
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    const CUresult status = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (status == CUDA_SUCCESS)
        return;

    fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", status, __FILE__, __LINE__);
    exit(-1);
}
// Returns the device's SIMD width (the CUDA warp size), querying the driver
// on first use and caching the result in simdWidth_ thereafter.
int device_t<CUDA>::simdWidth(){
  // Non-zero means a previous query already cached the value.
  if(simdWidth_)
    return simdWidth_;

  OCCA_EXTRACT_DATA(CUDA, Device);

  OCCA_CUDA_CHECK("Device: Get Warp Size",
                  cuDeviceGetAttribute(&simdWidth_, CU_DEVICE_ATTRIBUTE_WARP_SIZE, data_.device) );

  return simdWidth_;
}
/* Selects the CUDA device with the most multiprocessors and stores its handle
 * in the global cuDevice, along with the globals maxGridDim (max grid X
 * dimension) and numMultiProcessors.  CHECK_STATUS reports driver errors to
 * Java through `env`; throw_cuda_errror_exception raises when no device
 * exists. */
void getBestDevice(JNIEnv *env){
  int num_devices;
  int status;
  int i;
  CUdevice temp_device;
  int curr_multiprocessors;
  int max_multiprocessors = -1;
  int max_i = -1;

  status = cuDeviceGetCount(&num_devices);
  CHECK_STATUS(env,"error in cuDeviceGetCount",status)

  if(num_devices == 0)
    throw_cuda_errror_exception(env,"0 Cuda Devices were found",0);

  /* Pick the ordinal with the highest multiprocessor count. */
  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&temp_device, i);
    CHECK_STATUS(env,"error in cuDeviceGet",status)

    status = cuDeviceGetAttribute(&curr_multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device);
    CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)

    if(curr_multiprocessors > max_multiprocessors)
    {
      max_multiprocessors = curr_multiprocessors;
      max_i = i;
    }
  }

  status = cuDeviceGet(&cuDevice, max_i);
  CHECK_STATUS(env,"error in cuDeviceGet",status)

  status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);
  CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)

  numMultiProcessors = max_multiprocessors;
}
// Picks the CUDA device with the highest clockRate * multiprocessorCount
// score.  Fails hard when no device can be scored.
CUdevice CudaModule::selectDevice(void)
{
    int deviceCount;
    checkError("cuDeviceGetCount", cuDeviceGetCount(&deviceCount));

    CUdevice best = 0;
    S32 bestScore = FW_S32_MIN;

    for (int idx = 0; idx < deviceCount; ++idx)
    {
        CUdevice candidate;
        checkError("cuDeviceGet", cuDeviceGet(&candidate, idx));

        int clockRate;
        checkError("cuDeviceGetAttribute", cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, candidate));

        int numProcessors;
        checkError("cuDeviceGetAttribute", cuDeviceGetAttribute(&numProcessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, candidate));

        // Score: raw clock (kHz) times SM count.
        S32 score = clockRate * numProcessors;
        if (score > bestScore)
        {
            best = candidate;
            bestScore = score;
        }
    }

    if (bestScore == FW_S32_MIN)
        fail("No appropriate CUDA device found!");

    return best;
}
// Returns the requested attribute of the selected CUDA device, or 0 when
// CUDA is not available on this system.
int CudaModule::getDeviceAttribute(CUdevice_attribute attrib)
{
    staticInit();
    if (!s_available)
        return 0;

    int result;
    checkError("cuDeviceGetAttribute", cuDeviceGetAttribute(&result, attrib, s_device));
    return result;
}
// Maps a flat block count onto a 2D grid: halves the X extent (rounding up)
// and doubles Y until X fits within the device's maximum grid X dimension.
// The resulting grid covers at least numBlocks blocks.
Vec2i CudaModule::selectGridSize(int numBlocks)
{
    int gridLimitX;
    checkError("cuDeviceGetAttribute", cuDeviceGetAttribute(&gridLimitX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, s_device));

    Vec2i grid(numBlocks, 1);
    for (; grid.x > gridLimitX; grid.y <<= 1)
        grid.x = (grid.x + 1) >> 1;
    return grid;
}
// Grace builtin: returns the total number of CUDA cores of device 0
// (multiprocessor count times cores-per-SM for its compute capability) as a
// Float64 object.  Raises an error when no CUDA device is present.
// Fix: the CUdevice handle was used uninitialized -- cuDeviceGetAttribute and
// cuDeviceComputeCapability were called on an indeterminate handle because
// cuDeviceGet was never invoked.  The handle is now obtained explicitly for
// device ordinal 0.
Object cuda_cores(Object self, int nparts, int *argcv, Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);
    int mpcount;
    cuDeviceGetAttribute(&mpcount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    // Scale SM count by cores per SM for this architecture.
    mpcount *= coreMultiplicand(major, minor);
    return alloc_Float64(mpcount);
}
// Queries a single device attribute via the driver API.  On failure the error
// is logged with source location, device state is reset so buffered profiler
// data is flushed, and the process exits with a failure status.
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    const CUresult status = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (status == CUDA_SUCCESS)
        return;

    fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", status, __FILE__, __LINE__);

    // cudaDeviceReset tears down all device state.  Not required in normal
    // operation, but it guarantees outstanding profile records are flushed
    // before the process exits.
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
bool VideoDecoderCUDAPrivate::initCuda() { CUresult result = cuInit(0); if (result != CUDA_SUCCESS) { available = false; qWarning("cuInit(0) faile (%d)", result); return false; } cudev = GetMaxGflopsGraphicsDeviceId(); int clockRate; cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev); int major, minor; cuDeviceComputeCapability(&major, &minor, cudev); char devname[256]; cuDeviceGetName(devname, 256, cudev); description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000); //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO? CUcontext cuCurrent = NULL; result = cuCtxPopCurrent(&cuCurrent); if (result != CUDA_SUCCESS) { qWarning("cuCtxPopCurrent: %d\n", result); return false; } checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx)); { AutoCtxLock lock(this, vid_ctx_lock); Q_UNUSED(lock); //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING) checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT //require compute capability >= 1.1 //flag: Reserved for future use, must be 0 //cuStreamAddCallback(stream, CUstreamCallback, this, 0); } return true; }
/* OpenACC/CUDA interop test: launches a "delay" kernel (which spins for a
 * given number of GPU clock ticks) on a CUDA stream bound to an OpenACC async
 * queue, then verifies that acc_wait() blocks for at least the programmed
 * delay and that a second wait on the already-finished queue returns almost
 * immediately. */
int main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float atime, dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  /* Query the device OpenACC selected. */
  devnum = acc_get_device_num (acc_device_nvidia);
  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  /* Clock rate is reported in kHz, so dtime (ms) * clkrate = tick count. */
  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  /* One counter slot per multiprocessor. */
  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;
  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  /* Bind the CUDA stream to OpenACC async queue 0. */
  acc_set_cuda_stream (0, stream);

  init_timers (1);
  start_timer (0);

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  acc_wait (1);
  atime = stop_timer (0);

  /* The wait must have taken at least the programmed delay. */
  if (atime < dtime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  start_timer (0);

  /* The kernel already completed; this wait should be near-instant
     (under 10 ms).
     NOTE(review): the error message below looks copy/pasted from the first
     check -- the condition actually tests the opposite direction. */
  acc_wait (1);
  atime = stop_timer (0);

  if (0.010 < atime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  acc_unmap_data (a);
  fini_timers ();

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
/*
 * Command-line driver: compiles a CUDA kernel source file with NVRTC for a
 * chosen device or an explicit compute capability, then prints attributes of
 * the requested kernel functions.
 *
 * Options: -k <kernel name> (repeatable), -d <device ordinal>,
 * -c <major.minor> (exclusive with -d), -s/-S <dynamic shmem per
 * thread/block>, -v (NVRTC version), -l (device list), -h (usage).
 */
int main(int argc, char *argv[])
{
    char *kernel_source;
    char *kfunc_names[MAX_KERNEL_FUNCTIONS];
    int kfunc_index = 0;
    int target_device = -1;
    long target_capability = -1;
    int print_version = 0;
    int print_devices = 0;
    int num_devices;
    int i, opt;
    int major;
    int minor;
    CUdevice device;
    CUcontext context;
    CUmodule cuda_module;
    CUresult rc;

    /* misc initialization */
    cmdname = basename(strdup(argv[0]));
    cuInit(0);
    rc = cuDeviceGetCount(&num_devices);
    if (rc != CUDA_SUCCESS)
        cuda_error(rc, "cuDeviceGetCount");

    /* parse the command line */
    while ((opt = getopt(argc, argv, "k:d:c:s:S:vlh")) >= 0)
    {
        switch (opt)
        {
            case 'k':
                if (kfunc_index == MAX_KERNEL_FUNCTIONS)
                {
                    fputs("Too much kernel function specified", stderr);
                    return 1;
                }
                kfunc_names[kfunc_index++] = strdup(optarg);
                break;
            case 'd':
                /* -d and -c are mutually exclusive target selectors */
                if (target_device >= 0)
                {
                    fputs("-d is specified twice or more", stderr);
                    usage();
                }
                if (target_capability >= 0)
                {
                    fputs("-d and -c are exclusive option", stderr);
                    usage();
                }
                target_device = atoi(optarg);
                if (target_device < 0 || target_device >= num_devices)
                {
                    fprintf(stderr, "invalid device: -d %d\n", target_device);
                    usage();
                }
                break;
            case 'c':
                if (target_capability >= 0)
                {
                    fputs("-c is specified twice or more", stderr);
                    usage();
                }
                if (target_device >= 0)
                {
                    fputs("-d and -c are exclusive option", stderr);
                    usage();
                }
                if (sscanf(optarg, "%d.%d", &major, &minor) != 2)
                {
                    fprintf(stderr, "invalid capability format: -c %s\n", optarg);
                    usage();
                }
                /* e.g. "7.5" -> 75, the form used for arch selection */
                target_capability = major * 10 + minor;
                break;
            case 's':
                dynamic_shmem_per_thread = atol(optarg);
                if (dynamic_shmem_per_thread < 0)
                {
                    fprintf(stderr, "invalid dynamic shmem per thread: %ld\n", dynamic_shmem_per_thread);
                    usage();
                }
                break;
            case 'S':
                dynamic_shmem_per_block = atol(optarg);
                if (dynamic_shmem_per_block < 0)
                {
                    fprintf(stderr, "invalid dynamic shmem per block: %ld", dynamic_shmem_per_block);
                    usage();
                }
                break;
            case 'v':
                print_version = 1;
                break;
            case 'l':
                print_devices = 1;
                break;
            case 'h':
            default:
                usage();
                break;
        }
    }

    /* exactly one kernel source path must remain; -v/-l alone are valid */
    if (optind + 1 != argc)
    {
        if (print_version || print_devices)
        {
            if (print_version)
                print_nvrtc_version();
            if (print_devices)
                print_cuda_devices(num_devices);
            return 0;
        }
        fputs("no kernel source is specified", stderr);
        usage();
    }
    kernel_source = argv[optind];

    /* no -c given: derive the capability from the target (or default) device */
    if (target_capability < 0)
    {
        CUdevice dev;

        if (target_device < 0)
            target_device = 0;	/* default device */
        rc = cuDeviceGet(&dev, target_device);
        if (rc != CUDA_SUCCESS)
            cuda_error(rc, "cuDeviceGet");
        rc = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
        if (rc != CUDA_SUCCESS)
            cuda_error(rc, "cuDeviceGetAttribute");
        rc = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
        if (rc != CUDA_SUCCESS)
            cuda_error(rc, "cuDeviceGetAttribute");
        target_capability = 10 * major + minor;
    }
    if (print_version)
        print_nvrtc_version();
    if (print_devices)
        print_cuda_devices(num_devices);

    /* make a dummy context */
    rc = cuDeviceGet(&device, 0);
    if (rc != CUDA_SUCCESS)
        cuda_error(rc, "cuDeviceGet");
    rc = cuCtxCreate(&context, 0, device);
    if (rc != CUDA_SUCCESS)
        cuda_error(rc, "cuCtxCreate");

    /* compile, then report attributes of each requested function */
    cuda_module = build_kernel_source(kernel_source, target_capability);
    for (i=0; i < kfunc_index; i++)
    {
        if (i > 0)
            putchar('\n');
        print_function_attrs(cuda_module, kfunc_names[i]);
    }

    /* drop a cuda context */
    rc = cuCtxDestroy(context);
    if (rc != CUDA_SUCCESS)
        cuda_error(rc, "cuCtxDestroy");

    return 0;
}
/* OpenACC/CUDA interop test: launches the "delay" kernel asynchronously on a
 * CUDA stream bound to OpenACC async queue 0, then checks that
 * acc_async_test_all() reports work in flight while the kernel spins and
 * reports completion after sleeping past the programmed delay. */
int main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  /* Query the device OpenACC selected. */
  devnum = acc_get_device_num (acc_device_nvidia);
  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  /* Clock rate is in kHz, so delay (ms) * clkrate gives the tick count. */
  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  /* One counter slot per multiprocessor. */
  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;
  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  /* Bind the CUDA stream to OpenACC async queue 0. */
  if (!acc_set_cuda_stream (0, stream))
    abort ();

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  /* Kernel is still spinning: the all-async query must report "running". */
  if (acc_async_test_all () != 0)
    {
      fprintf (stderr, "asynchronous operation not running\n");
      abort ();
    }

  /* Sleep past the delay; afterwards everything must have completed. */
  sleep ((int) (dtime / 1000.f) + 1);

  if (acc_async_test_all () != 1)
    {
      fprintf (stderr, "found asynchronous operation still running\n");
      abort ();
    }

  acc_unmap_data (a);

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
/*
 * Property query entry point for the CUDA gpuarray backend.  Exactly one of
 * `c` (context), `buf` (buffer) or `k` (kernel) selects the object being
 * queried; prop_id picks the property and the result is written through
 * `res`, whose pointed-to type depends on the property.
 *
 * Returns GA_NO_ERROR on success, GA_VALUE_ERROR when the object required by
 * the property class is missing, GA_IMPL_ERROR on driver failure.
 */
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }
  /* I know that 512 and 1024 are magic numbers.
     There is an indication in buffer.h, though. */
  /* Property ranges: [0,512) = context, [512,1024) = buffer, rest = kernel. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    /* scratch variables shared by the cases below */
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* caller takes ownership of the malloc'd name */
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    /* max local (block) size = device's max X block dimension */
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    /* local memory = shared memory per block in CUDA terms */
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    /* max global (grid) size = device's max X grid dimension */
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    /* cuMemGetInfo returns (free, total); total goes to res here */
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    /* ... and free goes to res here */
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    /* preferred local size multiple = warp size */
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
/*
 * Per-GPU worker thread for the distance-transform (DT) stage.
 *
 * p is a dt_partition*; pt->pid selects which GPU this thread drives and
 * indexes every per-device global (ctx[], *_dev[], func_*[], dev[]).
 * The thread:
 *   1. binds the CUDA context for its GPU,
 *   2. uploads the shared inputs (part scores, size/def/index tables),
 *   3. launches inverse_Q, dt1d_x and dt1d_y in sequence, with a
 *      cuCtxSynchronize() after each launch,
 *   4. copies this GPU's slice of M, tmpIx and tmpIy back to the host.
 * Only the thread with pid == 0 accumulates the global time_memcpy /
 * time_kernel timers.  Any CUDA failure prints a diagnostic and calls
 * exit(1), taking the whole process down.
 */
static CUT_THREADPROC dt_thread_func(void *p)
{
  dt_partition *pt = (dt_partition *)p;
  struct timeval tv;
  CUresult res;
  int thread_num_x=0, thread_num_y=0;
  int block_num_x=0, block_num_y=0;

  /* bind this host thread to the context of its assigned GPU */
  res = cuCtxSetCurrent(ctx[pt->pid]);
  if(res != CUDA_SUCCESS) {
    printf("cuCtxSetCurrent(ctx[%d]) failed: res = %s\n", pt->pid, cuda_response_to_string(res));
    exit(1);
  }

  /* allocate GPU memory */
  //printf("part_error_array_num = %d\n",part_error_array_num);

  /* upload inputs to this GPU; only thread 0 times the transfers */
  if(pt->pid == 0){
    gettimeofday(&tv_memcpy_start, NULL);
  }
  res = cuMemcpyHtoD(part_C_dev[pt->pid], dst_C, SUM_SIZE_C);
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(part_C_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuMemcpyHtoD(part_error_array_dev[pt->pid], part_error_array, part_error_array_num*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(part_error_array_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuMemcpyHtoD(pm_size_array_dev[pt->pid], &pt->size_array[0][0], pt->NoP*2*pt->L_MAX*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(pm_size_array_dev) falied: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuMemcpyHtoD(def_array_dev[pt->pid], pt->def, sum_size_def_array);
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(def_array_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuMemcpyHtoD(numpart_dev[pt->pid], pt->numpart, pt->NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(cuMemcpyHtoD(numpart_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuMemcpyHtoD(PIDX_array_dev[pt->pid], pt->dst_PIDX, pt->tmp_array_size);
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(PIDX_array) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuMemcpyHtoD(DID_4_array_dev[pt->pid], pt->dst_DID_4, pt->tmp_array_size);
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(DID_4__array) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  if(pt->pid == 0){
    gettimeofday(&tv_memcpy_end, NULL);
    tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
    time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  int sharedMemBytes = 0;

  /* get max thread num per block */
  int max_threads_num = 0;
  res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[pt->pid]);
  if(res != CUDA_SUCCESS){
    printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  /* prepare for launch inverse_Q */
  void* kernel_args_inverse[] = {
    &part_C_dev[pt->pid],
    &pm_size_array_dev[pt->pid],
    &part_error_array_dev[pt->pid],
    &part_error_array_num,
    (void*)&(pt->NoP),
    &PIDX_array_dev[pt->pid],
    &numpart_dev[pt->pid],
    (void*)&(pt->NoC),
    (void*)&(pt->max_numpart),
    (void*)&(pt->interval),
    (void*)&(pt->L_MAX),
    (void*)&(pt->pid),
    (void*)&(device_num)
  };

  /* define CUDA block shape */
  /* NOTE(review): if max_numpart*NoC > max_threads_num the first division
     yields 0 and the second divides by zero before the <1 fix-ups run —
     assumes small part counts; confirm against the models used. */
  int upper_limit_th_num_x = max_threads_num/(pt->max_numpart*pt->NoC);
  int upper_limit_th_num_y = max_threads_num/upper_limit_th_num_x;
  if(upper_limit_th_num_x < 1) upper_limit_th_num_x++;
  if(upper_limit_th_num_y < 1) upper_limit_th_num_y++;

  thread_num_x = (pt->max_dim0*pt->max_dim1 < upper_limit_th_num_x) ? (pt->max_dim0*pt->max_dim1) : upper_limit_th_num_x;
  thread_num_y = (pt->max_numpart < upper_limit_th_num_y) ? pt->max_numpart : upper_limit_th_num_y;

  block_num_x = (pt->max_dim0*pt->max_dim1) / thread_num_x;
  block_num_y = (pt->max_numpart) / thread_num_y;
  if((pt->max_dim0*pt->max_dim1) % thread_num_x != 0) block_num_x++;
  if(pt->max_numpart % thread_num_y != 0) block_num_y++;

  /* split the Y block dimension across the participating GPUs (ceil-div) */
  int blockDimY = thread_num_y / device_num;
  if(thread_num_y%device_num != 0){
    blockDimY++;
  }

  /* launch inverse_Q */
  if(pt->pid == 0){
    gettimeofday(&tv_kernel_start, NULL);
  }
  /* NOTE(review): the launch uses blockDimY (per-GPU slice), while the
     failure message below prints thread_num_y — they differ when
     device_num > 1. */
  res = cuLaunchKernel(
    func_inverse_Q[pt->pid], // call function
    block_num_x,             // gridDimX
    block_num_y,             // gridDimY
    pt->L_MAX-pt->interval,  // gridDimZ
    thread_num_x,            // blockDimX
    blockDimY,               // blockDimY
    pt->NoC,                 // blockDimZ
    sharedMemBytes,          // sharedMemBytes
    NULL,                    // hStream
    kernel_args_inverse,     // kernelParams
    NULL                     // extra
  );
  if(res != CUDA_SUCCESS) {
    printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
    printf("cuLaunchKernel(inverse_Q) failed : res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS) {
    printf("cuCtxSynchronize(inverse_Q) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  if(pt->pid == 0){
    gettimeofday(&tv_kernel_end, NULL);
    tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
    time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  /* prepare for launch dt1d_x */
  void* kernel_args_x[] = {
    &part_C_dev[pt->pid],           // FLOAT *src_start
    &tmpM_dev[pt->pid],             // FLOAT *dst
    &tmpIy_dev[pt->pid],            // int *ptr
    &DID_4_array_dev[pt->pid],      // int *DID_4_array,
    &def_array_dev[pt->pid],        // FLOAT *def_array,
    &pm_size_array_dev[pt->pid],    // int *size_array
    (void*)&(pt->NoP),              // int NoP
    &PIDX_array_dev[pt->pid],       // int *PIDX_array
    &part_error_array_dev[pt->pid], // int *error_array
    (void*)&(part_error_array_num), // int error_array_num
    &numpart_dev[pt->pid],          // int *numpart
    (void*)&(pt->NoC),              // int NoC
    (void*)&(pt->max_numpart),      // int max_numpart
    (void*)&(pt->interval),         // int interval
    (void*)&(pt->L_MAX),            // int L_MAX
    (void*)&(pt->pid),              // int pid
    (void*)&(device_num)            // int device_num
  };

  /* recompute the block shape for dt1d_x (budget of 64 threads over NoC) */
  max_threads_num = 64/pt->NoC;
  if(max_threads_num < 1) max_threads_num++;

  thread_num_x = (pt->max_dim1 < max_threads_num) ? pt->max_dim1 : max_threads_num;
  thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

  block_num_x = pt->max_dim1 / thread_num_x;
  block_num_y = pt->max_numpart / thread_num_y;
  if(pt->max_dim1 % thread_num_x != 0) block_num_x++;
  if(pt->max_numpart % thread_num_y != 0) block_num_y++;

  blockDimY = thread_num_y / device_num;
  if(thread_num_y%device_num != 0){
    blockDimY++;
  }

  /* launch dt1d_x */
  if(pt->pid == 0){
    gettimeofday(&tv_kernel_start, NULL);
  }
  res = cuLaunchKernel(
    func_dt1d_x[pt->pid],   // call function
    block_num_x,            // gridDimX
    block_num_y,            // gridDimY
    pt->L_MAX-pt->interval, // gridDimZ
    thread_num_x,           // blockDimX
    blockDimY,              // blockDimY
    pt->NoC,                // blockDimZ
    sharedMemBytes,         // sharedMemBytes
    NULL,                   // hStream
    kernel_args_x,          // kernelParams
    NULL                    // extra
  );
  if(res != CUDA_SUCCESS) {
    printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
    printf("cuLaunchKernel(dt1d_x) failed : res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS) {
    printf("cuCtxSynchronize(dt1d_x) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  if(pt->pid == 0){
    gettimeofday(&tv_kernel_end, NULL);
    tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
    time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  /* prepare for launch dt1d_y */
  void* kernel_args_y[] = {
    &tmpM_dev[pt->pid],             // FLOAT *src_start
    &M_dev[pt->pid],                // FLOAT *dst_start
    &tmpIx_dev[pt->pid],            // int *ptr_start
    &DID_4_array_dev[pt->pid],      // int *DID_4_array,
    &def_array_dev[pt->pid],        // FLOAT *def_array,
    (void*)&(pt->NoP),              // int NoP
    &pm_size_array_dev[pt->pid],    // int *size_array
    &numpart_dev[pt->pid],          // int *numpart,
    &PIDX_array_dev[pt->pid],       // int *PIDX_array,
    (void*)&(pt->NoC),              // int NoC
    (void*)&(pt->max_numpart),      // int max_numpart
    (void*)&(pt->interval),         // int interval
    (void*)&(pt->L_MAX),            // int L_MAX
    &part_error_array_dev[pt->pid], // int *error_array
    (void*)&(part_error_array_num), // int error_array_num
    (void*)&(pt->pid),              // int pid
    (void*)&(device_num)            // int device_num
  };

  /* block shape for dt1d_y: X now spans max_dim0 (same thread budget) */
  thread_num_x = (pt->max_dim0 < max_threads_num) ? pt->max_dim0 : max_threads_num;
  thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

  block_num_x = pt->max_dim0 / thread_num_x;
  block_num_y = pt->max_numpart / thread_num_y;
  if(pt->max_dim0 % thread_num_x != 0) block_num_x++;
  if(pt->max_numpart % thread_num_y != 0) block_num_y++;

  blockDimY = thread_num_y / device_num;
  if(thread_num_y%device_num != 0){
    blockDimY++;
  }

  /* launch dt1d_y */
  if(pt->pid == 0){
    gettimeofday(&tv_kernel_start, NULL);
  }
  res = cuLaunchKernel(
    func_dt1d_y[pt->pid],   // call functions
    block_num_x,            // gridDimX
    block_num_y,            // gridDimY
    pt->L_MAX-pt->interval, // gridDimZ
    thread_num_x,           // blockDimX
    blockDimY,              // blockDimY
    pt->NoC,                // blockDimZ
    sharedMemBytes,         // sharedMemBytes
    NULL,                   // hStream
    kernel_args_y,          // kernelParams
    NULL                    // extra
  );
  if(res != CUDA_SUCCESS) {
    printf("cuLaunchKernel(dt1d_y failed : res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS) {
    printf("cuCtxSynchronize(dt1d_y) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  if(pt->pid == 0){
    gettimeofday(&tv_kernel_end, NULL);
    tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
    time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  /* download data from GPU */
  /* download M from GPU:
     each GPU computed rows [start_kk, end_kk) of every (L, jj) slab;
     part_size   = bytes this GPU produced for the slab,
     pointer_size= offset of this GPU's share within the slab,
     move_size   = total slab size (advance for both pointers). */
  int sum_part_size = 0;
  int sum_pointer_size = 0;
  int sum_move_size = 0;
  int part_size = 0;
  int pointer_size = 0;
  int part_y = 0;
  int move_size = 0;
  int start_kk = 0;
  int end_kk = 0;
  int part_end_kk = 0;
  unsigned long long int pointer_dst_M = (unsigned long long int)pt->dst_M;
  unsigned long long int pointer_M_dev = (unsigned long long int)M_dev[pt->pid];

  for(int L=0; L<(pt->L_MAX-pt->interval); L++) {
    /**************************************************************************/
    /* loop condition */
    if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
      {
        continue;
      }
    /* loop conditon */
    /**************************************************************************/

    for(int jj=0; jj<pt->NoC; jj++) {
      /* rows per GPU for this component (ceil-div) */
      part_y = pt->numpart[jj] / device_num;
      if(pt->numpart[jj]%device_num != 0){
        part_y++;
      }

      start_kk = part_y * pt->pid;
      end_kk = part_y * (pt->pid + 1);
      if(end_kk > pt->numpart[jj]){
        end_kk = pt->numpart[jj];
      }
      if(pt->pid > 0){
        part_end_kk = part_y * pt->pid;
      }

      for(int kk=0; kk<pt->numpart[jj]; kk++) {
        int PIDX = pt->PIDX_array[L][jj][kk];
        int dims0 = pt->size_array[L][PIDX*2];
        int dims1 = pt->size_array[L][PIDX*2+1];
        if(start_kk <= kk && kk < end_kk){
          part_size += dims0 * dims1;
        }
        //if(pt->pid > 0 && part_start_kk <= kk && kk < part_end_kk){
        if(pt->pid > 0 && 0 <= kk && kk < part_end_kk){
          pointer_size += dims0 * dims1;
        }
        move_size += dims0 * dims1;
      }

      sum_part_size += part_size;
      sum_pointer_size += pointer_size;
      sum_move_size += move_size;

      // error pt->pid == 2 && L == 24 && jj == 1
      if(pt->pid*part_y < pt->numpart[jj]){
        if(pt->pid == 0){
          gettimeofday(&tv_memcpy_start, NULL);
        }
        res = cuMemcpyDtoH((void *)(pointer_dst_M+(unsigned long long int)(pointer_size*sizeof(FLOAT))), (CUdeviceptr)(pointer_M_dev+(unsigned long long int)(pointer_size*sizeof(FLOAT))), part_size*sizeof(FLOAT));
        if(res != CUDA_SUCCESS) {
          printf("error pid = %d\n",pt->pid);
          printf("cuMemcpyDtoH(dst_M) failed: res = %s\n", cuda_response_to_string(res));
          exit(1);
        }
        if(pt->pid == 0){
          gettimeofday(&tv_memcpy_end, NULL);
          tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
          time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
        }
      }

      pointer_dst_M += (unsigned long long int)(move_size * sizeof(FLOAT));
      pointer_M_dev += (unsigned long long int)(move_size * sizeof(FLOAT));
      part_size = 0;
      pointer_size = 0;
      move_size = 0;
    }
  }

  /* download tmpIx from GPU (same slab partitioning as M, int elements) */
  sum_part_size = 0;
  sum_pointer_size = 0;
  part_size = 0;
  pointer_size = 0;
  part_y = 0;
  move_size = 0;
  start_kk = 0;
  end_kk = 0;
  part_end_kk = 0;
  unsigned long long int pointer_dst_tmpIx = (unsigned long long int)pt->dst_tmpIx;
  unsigned long long int pointer_tmpIx_dev = (unsigned long long int)tmpIx_dev[pt->pid];

  for(int L=0; L<(pt->L_MAX-pt->interval); L++) {
    /**************************************************************************/
    /* loop condition */
    if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
      {
        continue;
      }
    /* loop conditon */
    /**************************************************************************/

    for(int jj=0; jj<pt->NoC; jj++) {
      part_y = pt->numpart[jj] / device_num;
      if(pt->numpart[jj]%device_num != 0){
        part_y++;
      }

      start_kk = part_y * pt->pid;
      end_kk = part_y * (pt->pid + 1);
      if(end_kk > pt->numpart[jj]){
        end_kk = pt->numpart[jj];
      }
      if(pt->pid > 0){
        part_end_kk = part_y * pt->pid;
      }

      for(int kk=0; kk<pt->numpart[jj]; kk++) {
        int PIDX = pt->PIDX_array[L][jj][kk];
        int dims0 = pt->size_array[L][PIDX*2];
        int dims1 = pt->size_array[L][PIDX*2+1];
        if(start_kk <= kk && kk < end_kk){
          part_size += dims0 * dims1;
        }
        if(pt->pid > 0){
          if(0 <= kk && kk < part_end_kk){
            pointer_size += dims0 * dims1;
          }
        }
        move_size += dims0 * dims1;
      }

      sum_part_size += part_size;
      sum_pointer_size += pointer_size;

      if(pt->pid*part_y < pt->numpart[jj]){
        if(pt->pid == 0){
          gettimeofday(&tv_memcpy_start, NULL);
        }
        res = cuMemcpyDtoH((void *)(pointer_dst_tmpIx+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIx_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
        if(res != CUDA_SUCCESS) {
          printf("error pid = %d\n",pt->pid);
          printf("cuMemcpyDtoH(tmpIx) failed: res = %s\n", cuda_response_to_string(res));
          exit(1);
        }
        if(pt->pid == 0){
          gettimeofday(&tv_memcpy_end, NULL);
          tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
          time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
        }
      }

      pointer_dst_tmpIx += (unsigned long long int)(move_size * sizeof(int));
      pointer_tmpIx_dev += (unsigned long long int)(move_size * sizeof(int));
      part_size = 0;
      pointer_size = 0;
      move_size = 0;
    }
  }

  /* download tmpIy from GPU (same scheme again) */
  sum_part_size = 0;
  sum_pointer_size = 0;
  part_size = 0;
  pointer_size = 0;
  part_y = 0;
  move_size = 0;
  start_kk = 0;
  end_kk = 0;
  part_end_kk = 0;
  unsigned long long int pointer_dst_tmpIy = (unsigned long long int)pt->dst_tmpIy;
  unsigned long long int pointer_tmpIy_dev = (unsigned long long int)tmpIy_dev[pt->pid];

  for(int L=0; L<(pt->L_MAX-pt->interval); L++) {
    /**************************************************************************/
    /* loop condition */
    if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
      {
        continue;
      }
    /* loop conditon */
    /**************************************************************************/

    for(int jj=0; jj<pt->NoC; jj++) {
      part_y = pt->numpart[jj] / device_num;
      if(pt->numpart[jj]%device_num != 0){
        part_y++;
      }

      start_kk = part_y * pt->pid;
      end_kk = part_y * (pt->pid + 1);
      if(end_kk > pt->numpart[jj]){
        end_kk = pt->numpart[jj];
      }
      if(pt->pid > 0){
        part_end_kk = part_y * pt->pid;
      }

      for(int kk=0; kk<pt->numpart[jj]; kk++) {
        int PIDX = pt->PIDX_array[L][jj][kk];
        int dims0 = pt->size_array[L][PIDX*2];
        int dims1 = pt->size_array[L][PIDX*2+1];
        if(start_kk <= kk && kk < end_kk){
          part_size += dims0 * dims1;
        }
        if(pt->pid > 0){
          if(0 <= kk && kk < part_end_kk){
            pointer_size += dims0 * dims1;
          }
        }
        move_size += dims0 * dims1;
      }

      sum_part_size += part_size;
      sum_pointer_size += pointer_size;

      if(pt->pid*part_y < pt->numpart[jj]){
        if(pt->pid == 0){
          gettimeofday(&tv_memcpy_start, NULL);
        }
        res = cuMemcpyDtoH((void *)(pointer_dst_tmpIy+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIy_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
        if(res != CUDA_SUCCESS) {
          printf("error pid = %d\n",pt->pid);
          printf("cuMemcpyDtoH(tmpIy) failed: res = %s\n", cuda_response_to_string(res));
          exit(1);
        }
        if(pt->pid == 0){
          gettimeofday(&tv_memcpy_end, NULL);
          tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
          time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
        }
      }

      pointer_dst_tmpIy += (unsigned long long int)(move_size * sizeof(int));
      pointer_tmpIy_dev += (unsigned long long int)(move_size * sizeof(int));
      part_size = 0;
      pointer_size = 0;
      move_size = 0;
    }
  }

  /* end of thread */
  CUT_THREADEND;
}
/* Create a CUDA driver context for the Halide runtime.
 *
 * user_context - opaque pointer forwarded to the error/debug callbacks.
 * ctx          - out: receives the created context on success.
 *
 * Initializes the driver, picks a device (halide_get_gpu_device override,
 * otherwise the last enumerated device), optionally dumps its attributes
 * when built with DEBUG, then creates the context.  Returns CUDA_SUCCESS
 * or the first failing driver status; every failure is also reported via
 * halide_error / halide_error_varargs. */
static CUresult create_context(void *user_context, CUcontext *ctx) {
    // Initialize CUDA
    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuInit failed (%s)", _get_error_name(err));
        return err;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuGetDeviceCount failed (%s)", _get_error_name(err));
        return err;
    }
    if (deviceCount <= 0) {
        halide_error(user_context, "CUDA: No devices available");
        return CUDA_ERROR_NO_DEVICE;
    }

    // -1 means "no explicit selection": fall back to the last device.
    int device = halide_get_gpu_device(user_context);
    if (device == -1) {
        device = deviceCount - 1;
    }

    // Get device
    CUdevice dev;
    CUresult status = cuDeviceGet(&dev, device);
    if (status != CUDA_SUCCESS) {
        halide_error(user_context, "CUDA: Failed to get device\n");
        return status;
    }

    DEBUG_PRINTF( user_context, " Got device %d\n", dev );

    // Dump device attributes
    #ifdef DEBUG
    {
        char name[256];
        name[0] = 0;
        err = cuDeviceGetName(name, 256, dev);
        DEBUG_PRINTF(user_context, " %s\n", name);
        if (err != CUDA_SUCCESS) {
            halide_error_varargs(user_context, "CUDA: cuDeviceGetName failed (%s)", _get_error_name(err));
            return err;
        }

        size_t memory = 0;
        err = cuDeviceTotalMem(&memory, dev);
        DEBUG_PRINTF(user_context, " total memory: %d MB\n", (int)(memory >> 20));
        if (err != CUDA_SUCCESS) {
            halide_error_varargs(user_context, "CUDA: cuDeviceTotalMem failed (%s)", _get_error_name(err));
            return err;
        }

        // Declare variables for other state we want to query.
        int max_threads_per_block = 0, warp_size = 0, num_cores = 0;
        int max_block_size[] = {0, 0, 0};
        int max_grid_size[] = {0, 0, 0};
        int max_shared_mem = 0, max_constant_mem = 0;
        int cc_major = 0, cc_minor = 0;

        // NULL dst terminates the query table.
        struct {int *dst; CUdevice_attribute attr;} attrs[] = {
            {&max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK},
            {&warp_size,             CU_DEVICE_ATTRIBUTE_WARP_SIZE},
            {&num_cores,             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT},
            {&max_block_size[0],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X},
            {&max_block_size[1],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y},
            {&max_block_size[2],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z},
            {&max_grid_size[0],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X},
            {&max_grid_size[1],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y},
            {&max_grid_size[2],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z},
            {&max_shared_mem,        CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK},
            {&max_constant_mem,      CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
            {&cc_major,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
            {&cc_minor,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
            {NULL,                   CU_DEVICE_ATTRIBUTE_MAX}};

        // Do all the queries.
        for (int i = 0; attrs[i].dst; i++) {
            err = cuDeviceGetAttribute(attrs[i].dst, attrs[i].attr, dev);
            if (err != CUDA_SUCCESS) {
                halide_error_varargs(user_context,
                                     "CUDA: cuDeviceGetAttribute failed (%s) for attribute %d",
                                     _get_error_name(err), (int)attrs[i].attr);
                return err;
            }
        }

        // threads per core is a function of the compute capability
        // (unlisted architectures — e.g. sm_4x doesn't exist, and newer
        // ones than this table knows — report 0 cores here; debug only).
        int threads_per_core = (cc_major == 1 ? 8 :
                                cc_major == 2 ? (cc_minor == 0 ? 32 : 48) :
                                cc_major == 3 ? 192 :
                                cc_major == 5 ? 128 : 0);

        DEBUG_PRINTF(user_context,
                     " max threads per block: %d\n"
                     " warp size: %d\n"
                     " max block size: %d %d %d\n"
                     " max grid size: %d %d %d\n"
                     " max shared memory per block: %d\n"
                     " max constant memory per block: %d\n"
                     " compute capability %d.%d\n"
                     " cuda cores: %d x %d = %d\n",
                     max_threads_per_block, warp_size,
                     max_block_size[0], max_block_size[1], max_block_size[2],
                     max_grid_size[0], max_grid_size[1], max_grid_size[2],
                     max_shared_mem, max_constant_mem,
                     cc_major, cc_minor,
                     num_cores, threads_per_core, num_cores * threads_per_core);
    }
    #endif

    // Create context
    DEBUG_PRINTF( user_context, " cuCtxCreate %d -> ", dev );
    err = cuCtxCreate(ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
        halide_error_varargs(user_context, "CUDA: cuCtxCreate failed (%s)", _get_error_name(err));
        return err;
    } else {
        // Success: report the context pointer and its API version (debug only).
        unsigned int version = 0;
        cuCtxGetApiVersion(*ctx, &version);
        DEBUG_PRINTF( user_context, "%p (%d)\n", *ctx, version);
    }

    return CUDA_SUCCESS;
}
/* Accumulate per-component scores into the final a_score image on GPU 0.
 *
 * ac_score         - in/out host buffer (gpu_size_A_SCORE bytes), uploaded,
 *                    updated by the calc_a_score kernel, downloaded back.
 * score            - per-component score planes; uploaded as one contiguous
 *                    run starting at &score[0][0] (assumes the NoC planes
 *                    are laid out back-to-back — TODO confirm at the
 *                    allocation site).
 * ssize_start      - per-component offsets into the packed score buffer.
 * MI               - model info (image size, padding, root sizes).
 * scale            - current pyramid scale factor.
 * size_score_array - per-component score sizes in bytes (summed to get the
 *                    total upload size).
 * NoC              - number of components; also used as blockDimZ.
 *
 * Runs entirely on device 0 (dev[0] / func_calc_a_score[0]); any CUDA
 * failure prints a diagnostic and exit(1)s. */
static void calc_a_score_GPU(FLOAT *ac_score, FLOAT **score, int *ssize_start, Model_info *MI, FLOAT scale, int *size_score_array, int NoC)
{
  CUresult res;
  const int IHEI = MI->IM_HEIGHT;
  const int IWID = MI->IM_WIDTH;
  int pady_n = MI->pady;
  int padx_n = MI->padx;
  int block_pad = (int)(scale/2.0);
  struct timeval tv;

  /* page-locked, device-mappable host scratch for per-component radii */
  int *RY_array, *RX_array;
  res = cuMemHostAlloc((void**)&RY_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
  if(res != CUDA_SUCCESS) {
    printf("cuMemHostAlloc(RY_array) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemHostAlloc((void**)&RX_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
  if(res != CUDA_SUCCESS) {
    printf("cuMemHostAlloc(RX_array) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  /* scaled half-size of each component's root filter, plus padding */
  for(int i = 0; i < NoC; i++) {
    int rsize[2] = {MI->rsize[i*2], MI->rsize[i*2+1]};
    RY_array[i] = (int)((FLOAT)rsize[0]*scale/2.0-1.0+block_pad);
    RX_array[i] = (int)((FLOAT)rsize[1]*scale/2.0-1.0+block_pad);
  }

  CUdeviceptr ac_score_dev, score_dev;
  CUdeviceptr ssize_dev, size_score_dev;
  CUdeviceptr RY_dev, RX_dev;

  /* total bytes of all component score planes */
  int size_score=0;
  for(int i = 0; i < NoC; i++) {
    size_score += size_score_array[i];
  }

  /* allocate GPU memory */
  res = cuMemAlloc(&ac_score_dev, gpu_size_A_SCORE);
  if(res != CUDA_SUCCESS) {
    printf("cuMemAlloc(ac_score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemAlloc(&score_dev, size_score);
  if(res != CUDA_SUCCESS) {
    printf("cuMemAlloc(score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemAlloc(&ssize_dev, NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemAlloc(ssize) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemAlloc(&size_score_dev, NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemAlloc(size_score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemAlloc(&RY_dev, NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemAlloc(RY) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemAlloc(&RX_dev, NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemAlloc(RX) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  gettimeofday(&tv_memcpy_start, nullptr);

  /* upload date to GPU */
  res = cuMemcpyHtoD(ac_score_dev, &ac_score[0], gpu_size_A_SCORE);
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(ac_score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemcpyHtoD(score_dev, &score[0][0], size_score);
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemcpyHtoD(ssize_dev, &ssize_start[0], NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(ssize) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemcpyHtoD(size_score_dev, &size_score_array[0], NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(size_score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemcpyHtoD(RY_dev, &RY_array[0], NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(RY) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemcpyHtoD(RX_dev, &RX_array[0], NoC*sizeof(int));
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD(RX) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  gettimeofday(&tv_memcpy_end, nullptr);
  tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
  time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

  /* kernel argument list for calc_a_score (order matches the kernel) */
  void* kernel_args[] = {
    (void*)&IWID,
    (void*)&IHEI,
    (void*)&scale,
    (void*)&padx_n,
    (void*)&pady_n,
    &RX_dev,
    &RY_dev,
    &ac_score_dev,
    &score_dev,
    &ssize_dev,
    (void*)&NoC,
    &size_score_dev
  };

  int sharedMemBytes = 0;

  /* define CUDA block shape */
  int max_threads_num = 0;
  int thread_num_x, thread_num_y;
  int block_num_x, block_num_y;
  res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[0]);
  if(res != CUDA_SUCCESS){
    printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  /* square X/Y block sized so x*y*NoC <= max threads per block */
  NR_MAXTHREADS_X[0] = (int)sqrt((double)max_threads_num/NoC);
  NR_MAXTHREADS_Y[0] = (int)sqrt((double)max_threads_num/NoC);

  thread_num_x = (IWID < NR_MAXTHREADS_X[0]) ? IWID : NR_MAXTHREADS_X[0];
  thread_num_y = (IHEI < NR_MAXTHREADS_Y[0]) ? IHEI : NR_MAXTHREADS_Y[0];

  block_num_x = IWID / thread_num_x;
  block_num_y = IHEI / thread_num_y;
  if(IWID % thread_num_x != 0) block_num_x++;
  if(IHEI % thread_num_y != 0) block_num_y++;

  gettimeofday(&tv_kernel_start, nullptr);

  /* launch GPU kernel */
  res = cuLaunchKernel(
    func_calc_a_score[0], // call function
    block_num_x,          // gridDimX
    block_num_y,          // gridDimY
    1,                    // gridDimZ
    thread_num_x,         // blockDimX
    thread_num_y,         // blockDimY
    NoC,                  // blockDimZ
    sharedMemBytes,       // sharedMemBytes
    nullptr,              // hStream
    kernel_args,          // kernelParams
    nullptr               // extra
  );
  if(res != CUDA_SUCCESS) {
    printf("cuLaunchKernel(calc_a_score) failed : res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS) {
    printf("cuCtxSynchronize(calc_a_score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  gettimeofday(&tv_kernel_end, nullptr);
  tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
  time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

  gettimeofday(&tv_memcpy_start, nullptr);

  /* download data from GPU */
  res = cuMemcpyDtoH(ac_score, ac_score_dev, gpu_size_A_SCORE);
  if(res != CUDA_SUCCESS) {
    printf("cuMemcpyDtoH(ac_score) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  gettimeofday(&tv_memcpy_end, nullptr);
  tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
  time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

  /* free GPU memory */
  res = cuMemFree(ac_score_dev);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFree(ac_score_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemFree(score_dev);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFree(score_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemFree(ssize_dev);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFree(ssize_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemFree(size_score_dev);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFree(size_score_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemFree(RY_dev);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFree(RY_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemFree(RX_dev);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFree(RX_dev) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  /* free CPU memory */
  res = cuMemFreeHost(RY_array);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFreeHost(RY_array) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
  res = cuMemFreeHost(RX_array);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFreeHost(RX_array) failed: res = %s\n", cuda_response_to_string(res));
    exit(1);
  }
}
/* Initialize a pocl CUDA device: create the driver context and populate
 * the OpenCL device-info fields from CUDA device 0's attributes.
 *
 * device     - the cl_device_id to fill in; a no-op when device->data is
 *              already set.
 * parameters - driver parameter string (unused here).
 *
 * Fixes over the previous version:
 *  - cuDeviceGetAttribute writes exactly one int, so each attribute is
 *    read into a local int and then assigned; the old (int *) casts onto
 *    wider fields (size_t / cl_ulong) left their upper bytes
 *    uninitialized on 64-bit hosts.
 *  - Attribute queries, cuDeviceGetName and cuMemGetInfo are now checked
 *    with CUDA_CHECK, matching the style used for cuInit/cuDeviceGet.
 *  - CU_DEVICE_ATTRIBUTE_CLOCK_RATE is reported in kHz while
 *    CL_DEVICE_MAX_CLOCK_FREQUENCY is defined in MHz, so divide by 1000.
 *  - malloc results are checked.
 */
void
pocl_cuda_init (cl_device_id device, const char *parameters)
{
  CUresult result;
  int value = 0;

  result = cuInit (0);
  CUDA_CHECK (result, "cuInit");

  if (device->data)
    return; /* already initialized */

  pocl_cuda_device_data_t *data = malloc (sizeof (pocl_cuda_device_data_t));
  if (data == NULL)
    return; /* out of host memory: leave device uninitialized */

  result = cuDeviceGet (&data->device, 0);
  CUDA_CHECK (result, "cuDeviceGet");

  /* Get specific device name */
  device->long_name = device->short_name = malloc (256 * sizeof (char));
  if (device->long_name == NULL)
    {
      free (data);
      return;
    }
  result = cuDeviceGetName (device->long_name, 256, data->device);
  CUDA_CHECK (result, "cuDeviceGetName");

  /* Get other device properties (one int per attribute; see header note) */
  result = cuDeviceGetAttribute (
      &value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->max_work_group_size = value;

  result = cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                 data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->max_work_item_sizes[0] = value;

  result = cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
                                 data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->max_work_item_sizes[1] = value;

  result = cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
                                 data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->max_work_item_sizes[2] = value;

  result = cuDeviceGetAttribute (
      &value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
      data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->local_mem_size = value;

  result = cuDeviceGetAttribute (
      &value, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->max_compute_units = value;

  /* CLOCK_RATE is kHz; CL_DEVICE_MAX_CLOCK_FREQUENCY is MHz */
  result = cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                 data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->max_clock_frequency = value / 1000;

  result = cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED,
                                 data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->error_correction_support = value;

  result = cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_INTEGRATED,
                                 data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->host_unified_memory = value;

  result = cuDeviceGetAttribute (
      &value, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  device->max_constant_buffer_size = value;

  device->preferred_vector_width_char = 1;
  device->preferred_vector_width_short = 1;
  device->preferred_vector_width_int = 1;
  device->preferred_vector_width_long = 1;
  device->preferred_vector_width_float = 1;
  device->preferred_vector_width_double = 1;
  device->preferred_vector_width_half = 0;
  device->native_vector_width_char = 1;
  device->native_vector_width_short = 1;
  device->native_vector_width_int = 1;
  device->native_vector_width_long = 1;
  device->native_vector_width_float = 1;
  device->native_vector_width_double = 1;
  device->native_vector_width_half = 0;

  device->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                             | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                             | CL_FP_DENORM;
  device->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                             | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                             | CL_FP_DENORM;

  device->local_mem_type = CL_LOCAL;
  /* deliberately overrides the INTEGRATED query above, as before */
  device->host_unified_memory = 0;

  /* Get GPU architecture name */
  int sm_maj = 0, sm_min = 0;
  result = cuDeviceGetAttribute (
      &sm_maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  result = cuDeviceGetAttribute (
      &sm_min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, data->device);
  CUDA_CHECK (result, "cuDeviceGetAttribute");
  char *gpu_arch = malloc (16 * sizeof (char));
  snprintf (gpu_arch, 16, "sm_%d%d", sm_maj, sm_min);
  device->llvm_cpu = pocl_get_string_option ("POCL_CUDA_GPU_ARCH", gpu_arch);
  POCL_MSG_PRINT_INFO ("[CUDA] GPU architecture = %s\n", device->llvm_cpu);

  /* Create context */
  result = cuCtxCreate (&data->context, CU_CTX_MAP_HOST, data->device);
  CUDA_CHECK (result, "cuCtxCreate");

  /* Get global memory size */
  size_t memfree, memtotal;
  result = cuMemGetInfo (&memfree, &memtotal);
  CUDA_CHECK (result, "cuMemGetInfo");
  device->max_mem_alloc_size = max (memtotal / 4, 128 * 1024 * 1024);
  device->global_mem_size = memtotal;

  device->data = data;
}
static struct ptx_device * nvptx_open_device (int n) { struct ptx_device *ptx_dev; CUdevice dev, ctx_dev; CUresult r; int async_engines, pi; r = cuDeviceGet (&dev, n); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r)); ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device)); ptx_dev->ord = n; ptx_dev->dev = dev; ptx_dev->ctx_shared = false; r = cuCtxGetDevice (&ctx_dev); if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r)); if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev) { /* The current host thread has an active context for a different device. Detach it. */ CUcontext old_ctx; r = cuCtxPopCurrent (&old_ctx); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); } r = cuCtxGetCurrent (&ptx_dev->ctx); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); if (!ptx_dev->ctx) { r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r)); } else ptx_dev->ctx_shared = true; r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); ptx_dev->overlap = pi; r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); ptx_dev->map = pi; r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); ptx_dev->concur = pi; r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); ptx_dev->mode = pi; r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal 
("cuDeviceGetAttribute error: %s", cuda_error (r)); ptx_dev->mkern = pi; r = cuDeviceGetAttribute (&async_engines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); if (r != CUDA_SUCCESS) async_engines = 1; ptx_dev->images = NULL; pthread_mutex_init (&ptx_dev->image_lock, NULL); init_streams_for_device (ptx_dev, async_engines); return ptx_dev; }