Example #1
0
/*
 * Undo a partially completed gaspi_gpu_init().
 *
 * Destroys every CUDA event and stream handle that is non-NULL in the first
 * num_devices entries of the global gpus table, frees the table and resets
 * gpus to NULL. The table must have been zero-initialised so that handles
 * which were never created are NULL and therefore skipped.
 *
 * device_ids[k] is the CUDA device that entry k of the table belongs to.
 */
static void
gaspi_gpu_init_rollback(const int * const device_ids, const int num_devices)
{
  int i, j, k;

  if( gpus == NULL )
    {
      return;
    }

  for(k = 0; k < num_devices; k++)
    {
      /* Best effort: make the owning device current before destroying its
       * handles. Return codes are ignored on purpose, we are already on an
       * error path and have nothing better to report. */
      (void) cudaSetDevice(device_ids[k]);

      for(i = 0; i < GASPI_MAX_QP; i++)
        {
          for(j = 0; j < GASPI_CUDA_EVENTS; j++)
            {
              if( gpus[k].events[i][j].event != NULL )
                {
                  (void) cudaEventDestroy(gpus[k].events[i][j].event);
                }
            }

          if( gpus[k].streams[i] != NULL )
            {
              (void) cudaStreamDestroy(gpus[k].streams[i]);
            }
        }
    }

  free(gpus);
  gpus = NULL;
}

/*
 * Select the CUDA devices to be used and create their streams and events.
 *
 * A device is selected when its compute capability major version is >= 3
 * and _gaspi_find_GPU_numa_node() reports for it the same node that
 * _gaspi_find_dev_numa_node() returned. For every selected device, each of
 * the GASPI_MAX_QP stream slots gets one non-blocking stream and
 * GASPI_CUDA_EVENTS events created with timing disabled.
 *
 * On success the global gpus table holds one entry per selected device and
 * the context fields gpu_count and use_gpus are set.
 *
 * Returns:
 *   GASPI_SUCCESS       everything was set up.
 *   GASPI_ERR_DEVICE    a CUDA runtime call failed or no CUDA device exists.
 *   GASPI_ERROR         no device matched the selection criteria.
 *   GASPI_ERR_MEMALLOC  the gpus table could not be allocated.
 *
 * If a failure happens after the table was allocated, all streams and events
 * created so far are destroyed, the table is freed and gpus is reset to NULL.
 */
gaspi_return_t
gaspi_gpu_init(void)
{
  gaspi_context_t * const gctx = &glb_gaspi_ctx;
  int deviceCount;
  cudaError_t cuda_error_id = cudaGetDeviceCount(&deviceCount);
  if( cuda_error_id != cudaSuccess )
    {
      gaspi_print_error("Failed cudaGetDeviceCount." );
      return GASPI_ERR_DEVICE;
    }

  if( deviceCount <= 0 )
    {
      gaspi_print_error("No CUDA capable devices found.");
      return GASPI_ERR_DEVICE;
    }

  const int ib_numa_node = _gaspi_find_dev_numa_node();

  int device_id = 0;
  int gaspi_devices = 0;
  int direct_devices[GPI2_GPU_MAX_DIRECT_DEVS];
  struct cudaDeviceProp deviceProp;
  for(device_id = 0; device_id < deviceCount; device_id++)
    {
      //TODO: possibly add functionality to show properties structure
      cuda_error_id = cudaGetDeviceProperties(&deviceProp, device_id);
      if( cuda_error_id != cudaSuccess )
        {
          gaspi_print_error("Failed cudaGetDeviceProperties.");
          return GASPI_ERR_DEVICE;
        }

      if( deviceProp.major >= 3 ) /* TODO: magic number */
        {
          /* The device is made current before its NUMA node is queried. */
          cuda_error_id = cudaSetDevice(device_id);
          if( cuda_error_id != cudaSuccess )
            {
              gaspi_print_error("Failed cudaSetDevice.");
              return GASPI_ERR_DEVICE;
            }

          if( ib_numa_node == _gaspi_find_GPU_numa_node(device_id) )
            {
              /* NOTE(review): this bound leaves the last slot of
               * direct_devices unused; confirm whether the "- 1" is
               * intentional before widening it. */
              if( gaspi_devices < GPI2_GPU_MAX_DIRECT_DEVS - 1 )
                {
                  direct_devices[gaspi_devices] = device_id;
                  gaspi_devices++;
                }
            }
        }
    }

  if( 0 == gaspi_devices )
    {
      gaspi_print_error("No GPU Direct RDMA capable devices on the correct NUMA-socket were found.");
      return GASPI_ERROR;
    }

  /* Zero-initialised so that the rollback helper can tell created handles
   * (non-NULL) from ones that were never created. */
  gpus = (gaspi_gpu_t*) calloc((size_t) gaspi_devices, sizeof(gaspi_gpu_t));
  if( gpus == NULL )
    {
      gaspi_print_error("Failed to allocate memory.");
      return GASPI_ERR_MEMALLOC;
    }

  int i, j, k;
  for(k = 0 ; k < gaspi_devices; k++)
    {
      cuda_error_id = cudaSetDevice(direct_devices[k]);
      if( cuda_error_id != cudaSuccess )
        {
          gaspi_print_error("Failed cudaSetDevice.");
          gaspi_gpu_init_rollback(direct_devices, k + 1);
          return GASPI_ERR_DEVICE;
        }

      gpus[k].device_id = direct_devices[k];

      for(i = 0; i < GASPI_MAX_QP; i++)
        {
          for(j = 0; j < GASPI_CUDA_EVENTS; j++)
            {
              cuda_error_id = cudaEventCreateWithFlags(&gpus[k].events[i][j].event, cudaEventDisableTiming);
              if( cuda_error_id != cudaSuccess )
                {
                  gaspi_print_error("Failed cudaEventCreateWithFlags.");
                  gaspi_gpu_init_rollback(direct_devices, k + 1);
                  return GASPI_ERR_DEVICE;
                }
            }

          /* Exactly one stream per slot. Creating a default stream first and
           * then overwriting the handle with a non-blocking one would leak
           * the first stream. */
          cuda_error_id = cudaStreamCreateWithFlags(&gpus[k].streams[i], cudaStreamNonBlocking);
          if( cuda_error_id != cudaSuccess )
            {
              gaspi_print_error("Failed cudaStreamCreateWithFlags.");
              gaspi_gpu_init_rollback(direct_devices, k + 1);
              return GASPI_ERR_DEVICE;
            }
        }
    }

  gctx->gpu_count = gaspi_devices;
  gctx->use_gpus = 1;

  return GASPI_SUCCESS;
}
Example #2
0
/*
 * Undo a partially completed gaspi_init_GPUs().
 *
 * Destroys every CUDA event and stream handle that is non-NULL in the first
 * num_devices entries of the global gpus table, frees the table and resets
 * gpus to NULL. The table must have been zero-initialised so that handles
 * which were never created are NULL and therefore skipped.
 *
 * device_ids[k] is the CUDA device that entry k of the table belongs to.
 */
static void
gaspi_init_GPUs_rollback(const int * const device_ids, const int num_devices)
{
  int i, j, k;

  if( !gpus )
  {
    return;
  }

  for(k = 0; k < num_devices; k++)
  {
    /* Best effort: make the owning device current before destroying its
     * handles. Return codes are ignored on purpose, we are already on an
     * error path. */
    (void) cudaSetDevice(device_ids[k]);

    for(i = 0; i < GASPI_MAX_QP; i++)
    {
      for(j = 0; j < GASPI_CUDA_EVENTS; j++)
      {
        if( gpus[k].events[i][j].event != NULL )
        {
          (void) cudaEventDestroy(gpus[k].events[i][j].event);
        }
      }

      if( gpus[k].streams[i] != NULL )
      {
        (void) cudaStreamDestroy(gpus[k].streams[i]);
      }
    }
  }

  free(gpus);
  gpus = NULL;
}

/*
 * Select the CUDA devices to be used and create their streams and events.
 *
 * A device is selected when its compute capability major version is >= 3
 * and _gaspi_find_GPU_numa_node() reports for it the same node that
 * _gaspi_find_GPU_ib_numa_node() returned. At most as many devices as fit
 * into the local direct_devices array are selected. For every selected
 * device, each of the GASPI_MAX_QP stream slots gets one non-blocking stream
 * and GASPI_CUDA_EVENTS events created with timing disabled.
 *
 * On success the global gpus table holds one entry per selected device and
 * glb_gaspi_ctx.gpu_count / glb_gaspi_ctx.use_gpus are set. They are only
 * written once everything succeeded, so a failed call does not leave the
 * context announcing GPUs that were never set up.
 *
 * Returns GASPI_SUCCESS on success, GASPI_ERR_MEMALLOC when the gpus table
 * cannot be allocated and GASPI_ERROR for every other failure (CUDA runtime
 * call failed, no CUDA device, no matching device). If a failure happens
 * after the table was allocated, all streams and events created so far are
 * destroyed, the table is freed and gpus is reset to NULL.
 */
gaspi_return_t
gaspi_init_GPUs()
{
  int i, j, k;
  int deviceCount;
  int device_id = 0;
  int gaspi_devices = 0;
  int ib_numa_node;
  int direct_devices[32];
  /* Capacity of direct_devices, used to keep the selection loop in bounds. */
  const int max_direct_devices = (int) (sizeof(direct_devices) / sizeof(direct_devices[0]));
  struct cudaDeviceProp deviceProp;

  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
  if ( error_id != cudaSuccess )
  {
    gaspi_print_error("Failed cudaGetDeviceCount." );
    return GASPI_ERROR;
  }

  if( deviceCount <= 0 )
  {
    gaspi_print_error("No CUDA capable devices found.");
    return GASPI_ERROR;
  }

  ib_numa_node = _gaspi_find_GPU_ib_numa_node();

  for(device_id = 0; device_id < deviceCount; device_id++)
  {
    /* deviceProp is only meaningful if the query succeeded. */
    error_id = cudaGetDeviceProperties(&deviceProp, device_id);
    if( error_id != cudaSuccess )
    {
      gaspi_print_error("Failed cudaGetDeviceProperties.");
      return GASPI_ERROR;
    }

    if( deviceProp.major >= 3 ) /* TODO: magic number */
    {
      /* The device is made current before its NUMA node is queried. */
      error_id = cudaSetDevice(device_id);
      if( error_id != cudaSuccess )
      {
        gaspi_print_error("Failed cudaSetDevice.");
        return GASPI_ERROR;
      }

      if( ib_numa_node == _gaspi_find_GPU_numa_node(device_id) )
      {
        /* Never write past the end of direct_devices; devices beyond its
         * capacity are not selected. */
        if( gaspi_devices < max_direct_devices )
        {
          direct_devices[gaspi_devices] = device_id;
          gaspi_devices++;
        }
      }
    }
  }

  if( 0 == gaspi_devices )
  {
    gaspi_print_error("No GPU Direct RDMA capable devices on the correct NUMA-socket were found.");
    return GASPI_ERROR;
  }

  /* Zero-initialised so that the rollback helper can tell created handles
   * (non-NULL) from ones that were never created. */
  gpus = (gaspi_gpu *) calloc((size_t) gaspi_devices, sizeof(gaspi_gpu));
  if( !gpus )
  {
    gaspi_print_error("Failed to allocate memory.");
    return GASPI_ERR_MEMALLOC;
  }

  for(k = 0 ; k < gaspi_devices; k++)
  {
    error_id = cudaSetDevice(direct_devices[k]);
    if( error_id != cudaSuccess )
    {
      gaspi_print_error("Failed cudaSetDevice.");
      gaspi_init_GPUs_rollback(direct_devices, k + 1);
      return GASPI_ERROR;
    }

    gpus[k].device_id = direct_devices[k];

    for( i = 0; i < GASPI_MAX_QP; i++)
    {
      for(j = 0; j < GASPI_CUDA_EVENTS; j++)
      {
        error_id = cudaEventCreateWithFlags(&gpus[k].events[i][j].event, cudaEventDisableTiming);
        if( error_id != cudaSuccess )
        {
          gaspi_print_error("Failed cudaEventCreateWithFlags.");
          gaspi_init_GPUs_rollback(direct_devices, k + 1);
          return GASPI_ERROR;
        }
      }

      /* Exactly one stream per slot. Creating a default stream first and
       * then overwriting the handle with a non-blocking one would leak the
       * first stream. */
      error_id = cudaStreamCreateWithFlags(&gpus[k].streams[i], cudaStreamNonBlocking);
      if( error_id != cudaSuccess )
      {
        gaspi_print_error("Failed cudaStreamCreateWithFlags.");
        gaspi_init_GPUs_rollback(direct_devices, k + 1);
        return GASPI_ERROR;
      }
    }
  }

  /* Publish the result only now that every device is fully set up. */
  glb_gaspi_ctx.gpu_count = gaspi_devices;
  glb_gaspi_ctx.use_gpus = 1;

  return GASPI_SUCCESS;
}