/* Initialize GPU support.
 *
 * Enumerates CUDA devices, keeps those with compute capability >= 3.0
 * (required for GPUDirect RDMA) that sit on the same NUMA node as the IB
 * device, allocates the global `gpus` table, and creates one non-blocking
 * stream plus GASPI_CUDA_EVENTS disable-timing events per queue pair for
 * each selected device.
 *
 * Returns:
 *   GASPI_SUCCESS       on success (gctx->gpu_count/use_gpus updated),
 *   GASPI_ERR_DEVICE    on any CUDA runtime failure,
 *   GASPI_ERR_MEMALLOC  if the gpus table cannot be allocated,
 *   GASPI_ERROR         if no suitable device was found.
 */
gaspi_return_t
gaspi_gpu_init(void)
{
  gaspi_context_t* const gctx = &glb_gaspi_ctx;

  int deviceCount;
  cudaError_t cuda_error_id = cudaGetDeviceCount(&deviceCount);
  if( cuda_error_id != cudaSuccess )
    {
      gaspi_print_error("Failed cudaGetDeviceCount.");
      return GASPI_ERR_DEVICE;
    }

  if( deviceCount <= 0 )
    {
      gaspi_print_error("No CUDA capable devices found.");
      return GASPI_ERR_DEVICE;
    }

  const int ib_numa_node = _gaspi_find_dev_numa_node();

  int gaspi_devices = 0;
  int direct_devices[GPI2_GPU_MAX_DIRECT_DEVS];
  struct cudaDeviceProp deviceProp;

  int device_id;
  for(device_id = 0; device_id < deviceCount; device_id++)
    {
      //TODO: possibly add functionality to show properties structure
      cuda_error_id = cudaGetDeviceProperties(&deviceProp, device_id);
      if( cuda_error_id != cudaSuccess )
        {
          /* Fix: report the failure like every other error path does. */
          gaspi_print_error("Failed cudaGetDeviceProperties.");
          return GASPI_ERR_DEVICE;
        }

      /* GPUDirect RDMA requires compute capability >= 3.0 (Kepler).
         TODO: magic number */
      if( deviceProp.major >= 3 )
        {
          cuda_error_id = cudaSetDevice(device_id);
          if( cuda_error_id != cudaSuccess )
            {
              gaspi_print_error("Failed cudaSetDevice.");
              return GASPI_ERR_DEVICE;
            }

          if( ib_numa_node == _gaspi_find_GPU_numa_node(device_id) )
            {
              /* Fix: was `< GPI2_GPU_MAX_DIRECT_DEVS - 1`, which left the
                 last slot of direct_devices permanently unused. */
              if( gaspi_devices < GPI2_GPU_MAX_DIRECT_DEVS )
                {
                  direct_devices[gaspi_devices] = device_id;
                  gaspi_devices++;
                }
            }
        }
    }

  if( 0 == gaspi_devices )
    {
      gaspi_print_error("No GPU Direct RDMA capable devices on the correct NUMA-socket were found.");
      return GASPI_ERROR;
    }

  gpus = (gaspi_gpu_t*) malloc(sizeof(gaspi_gpu_t) * gaspi_devices);
  if( gpus == NULL )
    {
      gaspi_print_error("Failed to allocate memory.");
      return GASPI_ERR_MEMALLOC;
    }

  int i, j, k;
  for(k = 0; k < gaspi_devices; k++)
    {
      cuda_error_id = cudaSetDevice(direct_devices[k]);
      if( cuda_error_id != cudaSuccess )
        {
          /* Fix: release the table instead of leaking it on error.
             NOTE(review): streams/events created so far are not destroyed
             here; the process is expected to abort init on this path. */
          free(gpus);
          gpus = NULL;
          return GASPI_ERR_DEVICE;
        }

      for(i = 0; i < GASPI_MAX_QP; i++)
        {
          /* Fix: create each stream exactly once. The original first called
             cudaStreamCreate() and later overwrote the same handle with
             cudaStreamCreateWithFlags(), leaking the default stream. */
          cuda_error_id = cudaStreamCreateWithFlags(&gpus[k].streams[i],
                                                    cudaStreamNonBlocking);
          if( cuda_error_id != cudaSuccess )
            {
              free(gpus);
              gpus = NULL;
              return GASPI_ERR_DEVICE;
            }

          for(j = 0; j < GASPI_CUDA_EVENTS; j++)
            {
              cuda_error_id = cudaEventCreateWithFlags(&gpus[k].events[i][j].event,
                                                       cudaEventDisableTiming);
              if( cuda_error_id != cudaSuccess )
                {
                  free(gpus);
                  gpus = NULL;
                  return GASPI_ERR_DEVICE;
                }
            }
        }

      gpus[k].device_id = direct_devices[k];
    }

  gctx->gpu_count = gaspi_devices;
  gctx->use_gpus = 1;

  return GASPI_SUCCESS;
}
/* Legacy GPU initialization entry point.
 *
 * Same job as gaspi_gpu_init(): select GPUDirect-RDMA-capable devices
 * (compute capability >= 3.0) on the IB NUMA node, allocate the global
 * `gpus` table and create per-queue streams and events.
 *
 * Returns GASPI_SUCCESS on success, GASPI_ERR_MEMALLOC on allocation
 * failure, GASPI_ERROR otherwise.
 */
gaspi_return_t
gaspi_init_GPUs()
{
  int i, j, k;
  int deviceCount;
  int device_id = 0;
  int gaspi_devices = 0;
  int ib_numa_node;
  /* Fix: size the table with the shared constant (was a magic `32` with no
     bounds check on writes — a stack overflow on hosts with more devices). */
  int direct_devices[GPI2_GPU_MAX_DIRECT_DEVS];
  struct cudaDeviceProp deviceProp;

  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
  if( error_id != cudaSuccess )
    {
      gaspi_print_error("Failed cudaGetDeviceCount.");
      return GASPI_ERROR;
    }

  if( deviceCount <= 0 )
    {
      gaspi_print_error("No CUDA capable devices found.");
      return GASPI_ERROR;
    }

  ib_numa_node = _gaspi_find_GPU_ib_numa_node();

  for(device_id = 0; device_id < deviceCount; device_id++)
    {
      /* Fix: stop ignoring CUDA return codes in the selection loop. */
      error_id = cudaGetDeviceProperties(&deviceProp, device_id);
      if( error_id != cudaSuccess )
        {
          gaspi_print_error("Failed cudaGetDeviceProperties.");
          return GASPI_ERROR;
        }

      /* GPUDirect RDMA requires compute capability >= 3.0.
         TODO: magic number */
      if( deviceProp.major >= 3 )
        {
          error_id = cudaSetDevice(device_id);
          if( error_id != cudaSuccess )
            {
              gaspi_print_error("Failed cudaSetDevice.");
              return GASPI_ERROR;
            }

          if( ib_numa_node == _gaspi_find_GPU_numa_node(device_id) )
            {
              /* Fix: bound the write — the original wrote unconditionally. */
              if( gaspi_devices < GPI2_GPU_MAX_DIRECT_DEVS )
                {
                  direct_devices[gaspi_devices] = device_id;
                  gaspi_devices++;
                }
            }
        }
    }

  if( 0 == gaspi_devices )
    {
      gaspi_print_error("No GPU Direct RDMA capable devices on the correct NUMA-socket were found.");
      return GASPI_ERROR;
    }

  glb_gaspi_ctx.gpu_count = gaspi_devices;

  gpus = (gaspi_gpu *) malloc(sizeof(gaspi_gpu) * glb_gaspi_ctx.gpu_count);
  if( !gpus )
    {
      /* Fix: typo "mameory" in the error message. */
      gaspi_print_error("Failed to allocate memory.");
      return GASPI_ERR_MEMALLOC;
    }

  for(k = 0; k < gaspi_devices; k++)
    {
      error_id = cudaSetDevice(direct_devices[k]);
      if( error_id != cudaSuccess )
        {
          /* Fix: do not leak the table on error. NOTE(review): streams and
             events already created are not destroyed on this abort path. */
          free(gpus);
          gpus = NULL;
          return GASPI_ERROR;
        }

      for(i = 0; i < GASPI_MAX_QP; i++)
        {
          /* Fix: create each stream exactly once with the non-blocking flag.
             The original called cudaStreamCreate() and then overwrote the
             handle with cudaStreamCreateWithFlags(), leaking a stream. */
          error_id = cudaStreamCreateWithFlags(&gpus[k].streams[i],
                                               cudaStreamNonBlocking);
          if( error_id != cudaSuccess )
            {
              free(gpus);
              gpus = NULL;
              return GASPI_ERROR;
            }

          for(j = 0; j < GASPI_CUDA_EVENTS; j++)
            {
              error_id = cudaEventCreateWithFlags(&gpus[k].events[i][j].event,
                                                  cudaEventDisableTiming);
              if( error_id != cudaSuccess )
                {
                  free(gpus);
                  gpus = NULL;
                  return GASPI_ERROR;
                }
            }
        }

      gpus[k].device_id = direct_devices[k];
    }

  glb_gaspi_ctx.use_gpus = 1;

  return GASPI_SUCCESS;
}