Example #1
void Caffe::ConnectMasterSlaveDevice(const int master_device_id, const int slave_device_id) {
	int can_access;
	int target_device_id = slave_device_id;

	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, master_device_id, slave_device_id));

	if (can_access == 0){
		LOG(WARNING)<<"Device P2P access from GPU "<<master_device_id<<" to GPU "<<slave_device_id<<" cannot be enabled. Data transfers may be slow.";
	}else{
		CUDA_CHECK(cudaSetDevice(master_device_id));
		CUDA_CHECK(cudaDeviceEnablePeerAccess(target_device_id, 0));
		LOG(INFO)<<"Device P2P access from GPU "<<master_device_id<<" to GPU "<<slave_device_id<<" enabled.";
	}

	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, slave_device_id, master_device_id ));

	if (can_access == 0){
		LOG(WARNING)<<"Device P2P access from GPU "<<slave_device_id<<" to GPU "<<master_device_id<<" cannot be enabled. Data transfers may be slow.";
	}else{
		CUDA_CHECK(cudaSetDevice(slave_device_id));
		CUDA_CHECK(cudaDeviceEnablePeerAccess(master_device_id, 0));
		LOG(INFO)<<"Device P2P access from GPU "<<slave_device_id<<" to GPU "<<master_device_id<<" enabled.";
	}
	CUDA_CHECK(cudaSetDevice(master_device_id));
}
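
A minimal caller sketch (hypothetical, not from the Caffe sources; it assumes ConnectMasterSlaveDevice is a static member and that at least two GPUs are visible):

// Hypothetical usage of the helper above: enable P2P both ways between
// GPU 0 (master) and GPU 1 (slave); the helper leaves the master selected.
int device_count = 0;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
if (device_count >= 2) {
	Caffe::ConnectMasterSlaveDevice(0, 1);
}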
Example #2
int main(int argc, char **argv)
{
	// define the ptr
	int size = WIDTH * HEIGHT;
	float *h_data, *d_send_data, *d_recv_data; 
	bool use_cuda_time = true;

	if(argc != 3) {
		std::cout << "the number of paramter should be equal 2" << std::endl;
		std::cout << "egs: bandwidth_test_between2gpu 0 1" << std::endl;
		return 1;
	}
	//std::cout << "debug 1" << std::endl;
	int id0 = atoi(argv[1]);
	int id1 = atoi(argv[2]);
	std::cout << "id0=" << id0 << ", id1=" << id1 << std::endl;

	//h_data = new float[size];
	cudaMallocHost(&h_data, size*sizeof(float));
	init_data(h_data, size);

	cudaSetDevice(id0);
	cudaMalloc(&d_send_data, size*sizeof(float));
	cudaSetDevice(id1);
	cudaMalloc(&d_recv_data, size*sizeof(float));

	// d_send_data lives on GPU id0, so switch back before filling it.
	cudaSetDevice(id0);
	cudaMemcpy(d_send_data, h_data, size*sizeof(float), cudaMemcpyHostToDevice);

	int can_access_peer_0_1, can_access_peer_1_0;
	cudaSetDevice(id0);
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_0_1, id0, id1));
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_1_0, id1, id0));

	if(can_access_peer_0_1 && can_access_peer_1_0) {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": Yes" << std::endl;
		cudaSetDevice(id0);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id1, 0));
		cudaSetDevice(id1);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id0, 0));
	} else {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": No" << std::endl;
	}

	cudaSetDevice(id1);
	use_cuda_time = false;
	//use_cuda_time = true;
	test_2gpu(d_send_data, d_recv_data, size, id0, id1, use_cuda_time);

	cudaFreeHost(h_data);
	cudaFree(d_send_data);
	cudaFree(d_recv_data);

	return 0;
}
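
The actual bandwidth measurement above is delegated to test_2gpu(), whose body is not shown. A rough sketch of such a timing routine (hypothetical helper, not the original test_2gpu; it assumes the send buffer lives on id0 and the receive buffer on id1):

// Hypothetical helper: times repeated peer copies with CUDA events and
// returns the average milliseconds per copy.
static float time_peer_copy(float *d_dst, int dst_dev,
                            const float *d_src, int src_dev,
                            size_t count, int iters) {
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	for (int k = 0; k < iters; ++k) {
		cudaMemcpyPeer(d_dst, dst_dev, d_src, src_dev, count * sizeof(float));
	}
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	float ms = 0.0f;
	cudaEventElapsedTime(&ms, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	return ms / iters;
}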
Example #3
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
                                  int enable)
{
  /* This will perform device bounds checking for us */
  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);

  if (enable != prevEnabled) {
    /* If we're attempting to enable p2p access but p2p access isn't */
    /* supported, throw an error */
    if (enable) {
      int access = 0;
      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));

      if (!access) {
        THError("p2p access not supported for %d accessing %d",
                dev, devToAccess);
      }
    }

    state->p2pAccessEnabled[dev][devToAccess] = enable;

    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    /* This should be in sync with the current access state */
    if (enable) {
      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
    } else {
      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
}
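
A hypothetical caller (not from the cutorch sources) enabling access in both directions:

/* Hypothetical usage: enable P2P symmetrically between devices 0 and 1.
   Each call validates support and switches devices internally. */
THCState_setPeerToPeerAccess(state, 0, 1, 1);
THCState_setPeerToPeerAccess(state, 1, 0, 1);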
Example #4
void THCudaInit(THCudaState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCudaRNGState*)malloc(sizeof(THCudaRNGState));
  THCRandom_init(state->rngState, count, device);

  THCudaBlas_init(count, device);

  int i,j;
  for(i=0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    for (j=0; j < count; ++j)
    {
      if(i != j)
      {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if(can)
          THCudaCheck(cudaDeviceEnablePeerAccess(j, 0));
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
Example #5
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices) {
    THError("%d is not a device", dev);
  }
  if (devToAccess < 0 || devToAccess >= state->numDevices) {
    THError("%d is not a device", devToAccess);
  }
  if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    int access = 0;
    THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
    if (access) {
      cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // ignore and clear the error if access was already enabled
        cudaGetLastError();
      } else {
        THCudaCheck(err);
      }
      state->p2pAccessEnabled[dev][devToAccess] = 1;
    } else {
      state->p2pAccessEnabled[dev][devToAccess] = 0;
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}
Example #6
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  /* By default, all direct p2p kernel access (besides copy) is disallowed, */
  /* since direct access without knowing whether or not a certain operation */
  /* should be cross-GPU leads to synchronization errors. The user can choose */
  /* to disable this functionality, however. */
  state->p2pKernelAccessEnabled = 0;

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
  for (int i = 0; i < numDevices; ++i) {
    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
  }

  /* Build a table of all allowed p2p accesses, to avoid checking the p2p
     status at runtime. */
  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));

    for (int j = 0; j < numDevices; ++j) {
      /* Presume no access by default */
      state->p2pAccessEnabled[i][j] = 0;

      if (i == j) {
        /* A GPU can access itself */
        state->p2pAccessEnabled[i][j] = 1;
      } else {
        int access = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&access, i, j));

        if (access) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            /* Any future call to cudaGetLastError will now return an error, */
            /* even though we've already dealt with this specific error here. */
            /* Call cudaGetLastError once to reset the last error state. */
            cudaGetLastError();
            continue;
          }

          /* In case there are unknown errors returned from the above */
          THCudaCheck(err);

          /* Access could be enabled */
          state->p2pAccessEnabled[i][j] = 1;
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
Example #7
TEST(PeerAccess, EnableDisable) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    if (devices <= 1) {
        return;
    }

    int version;
    ret = cudaRuntimeGetVersion(&version);
    ASSERT_EQ(cudaSuccess, ret);

    typedef std::pair<int, int> peer_t;
    std::vector<peer_t> peers;

    for (int i = 0; i < devices; i++) {
        ret = cudaSetDevice(i);
        ASSERT_EQ(cudaSuccess, ret);

        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            ASSERT_EQ(cudaSuccess, ret);

            cudaError_t expected;

            if (peer) {
                expected = cudaSuccess;
                peers.push_back(peer_t(i, j));
            #if CUDA_VERSION >= 5000
            } else if (version >= 5000 /* 5.0 */) {
                expected = cudaErrorPeerAccessUnsupported;
            #endif
            } else {
                expected = cudaErrorInvalidDevice;
            }

            ret = cudaDeviceEnablePeerAccess(j, 0);
            EXPECT_EQ(expected, ret);
        }
    }

    /* Cleanup. */
    const size_t n_peers = peers.size();

    for (size_t i = 0; i < n_peers; i++) {
        ret = cudaSetDevice(peers[i].first);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaDeviceDisablePeerAccess(peers[i].second);
        EXPECT_EQ(cudaSuccess, ret);
    }
}
Example #8
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
  for (int i = 0; i < numDevices; ++i) {
    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
  }

  /* Build a table of all allowed p2p accesses, to avoid checking the p2p
     status at runtime. */

  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));

    for (int j = 0; j < numDevices; ++j) {
      /* Presume no access by default */
      state->p2pAccessEnabled[i][j] = 0;

      if (i == j) {
        /* A GPU can access itself */
        state->p2pAccessEnabled[i][j] = 1;
      } else {
        int access = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&access, i, j));

        if (access) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            /* Any future call to cudaGetLastError will now return an error, */
            /* even though we've already dealt with this specific error here. */
            /* Call cudaGetLastError once to reset the last error state. */
            cudaGetLastError();
            continue;
          }

          /* In case there are unknown errors returned from the above */
          THCudaCheck(err);

          /* Access could be enabled */
          state->p2pAccessEnabled[i][j] = 1;
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
Example #9
File: gpuops.c  Project: frankong/bart
void cuda_p2p_table(int n, bool table[n][n])
{
	assert(n == cuda_devices());

	for (int i = 0; i < n; i++) {
		for (int j = 0; j < n; j++) {


			int r;
			CUDA_ERROR(cudaDeviceCanAccessPeer(&r, i, j));

			table[i][j] = (1 == r);
		}
	}
}
Example #10
TEST(PeerAccess, CanAccessInvalidDevice) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    int peer;
    ret = cudaDeviceCanAccessPeer(&peer, device, devices);
    EXPECT_EQ(cudaErrorInvalidDevice, ret);
}
Example #11
P2PSync<Dtype>::P2PSync(shared_ptr<Solver<Dtype> > root_solver,
                        P2PSync<Dtype>* parent, const SolverParameter& param)
    : GPUParams<Dtype>(root_solver, param.device_id()),
      parent_(parent),
      children_(),
      queue_(),
      initial_iter_(root_solver->iter()),
      solver_() {
#ifndef CPU_ONLY
    int initial_device;
    CUDA_CHECK(cudaGetDevice(&initial_device));
    const int self = param.device_id();
    CUDA_CHECK(cudaSetDevice(self));

    if (parent == NULL) {
        solver_ = root_solver;
    } else {
        Caffe::set_root_solver(false);
        solver_.reset(new WorkerSolver<Dtype>(param, root_solver.get()));
        Caffe::set_root_solver(true);
    }
    this->configure(solver_.get());
    solver_->add_callback(this);

    if (parent) {
        // Enable p2p access between devices
        const int peer = parent->solver_->param().device_id();
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
        if (access) {
            CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
        } else {
            LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer;
        }
        // Allocate receiving buffer on parent
        CUDA_CHECK(cudaSetDevice(peer));
        CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype)));
        CUDA_CHECK(cudaSetDevice(self));
    }

    CUDA_CHECK(cudaSetDevice(initial_device));
#else
    NO_GPU;
#endif
}
Example #12
TEST(PeerAccess, DeviceReset) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    if (devices <= 1) {
        return;
    }

    bool found = false;
    int dj;

    for (int i = 0; i < devices && !(found); i++) {
        ret = cudaSetDevice(i);
        ASSERT_EQ(cudaSuccess, ret);

        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            ASSERT_EQ(cudaSuccess, ret);

            if (peer) {
                ret = cudaDeviceEnablePeerAccess(j, 0);
                ASSERT_EQ(cudaSuccess, ret);

                found = true;
                dj = j;
                break;
            }
        }
    }

    if (!(found)) {
        return;
    }

    /* Perform a device reset. */
    ret = cudaDeviceReset();
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaDeviceDisablePeerAccess(dj);
    EXPECT_EQ(cudaErrorPeerAccessNotEnabled, ret);
}
Example #13
TEST(PeerAccess, CanAccess) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    for (int i = 0; i < devices; i++) {
        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            EXPECT_EQ(cudaSuccess, ret);

            if (i == j) {
                EXPECT_FALSE(peer);
            }
        }
    }
}
Example #14
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
    int initial_device;
    CUDA_CHECK(cudaGetDevice(&initial_device));
    const int self = solver_->param().device_id();
    CUDA_CHECK(cudaSetDevice(self));

    if (parent_) {
        CUDA_CHECK(cudaFree(parent_grads_));
        const int peer = parent_->solver_->param().device_id();
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
        if (access) {
            CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
        }
    }

    CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
Example #15
static int can_use_fastpath(lua_State *L, int sock, uint32_t bind_addr, uint32_t addr) {
#if defined(USE_CUDA) && !defined(__APPLE__)
   if (bind_addr == addr) {
      int device;
      THCudaCheck(cudaGetDevice(&device));
      int ret = send(sock, &device, sizeof(device), 0);
      if (ret < 0) {
         close(sock);
         return LUA_HANDLE_ERROR(L, errno);
      }
      int remote_device;
      ret = recv(sock, &remote_device, sizeof(remote_device), 0);
      if (ret <= 0) {
         close(sock);
         return LUA_HANDLE_ERROR(L, errno);
      }
      if (device != remote_device) {
         int can;
         THCudaCheck(cudaDeviceCanAccessPeer(&can, device, remote_device));
         if (can) {
            cudaError_t err = cudaDeviceEnablePeerAccess(remote_device, 0);
            if (err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled) {
               if (err == cudaErrorPeerAccessAlreadyEnabled) cudaGetLastError();
               fprintf(stderr, "INFO: torch-ipc: CUDA IPC enabled between GPU%d and GPU%d\n", device, remote_device);
               return 1;
            } else {
               fprintf(stderr, "WARN: torch-ipc: CUDA IPC disabled between GPU%d and GPU%d: %s\n", device, remote_device, cudaGetErrorString(err));
            }
         } else {
            fprintf(stderr, "INFO: torch-ipc: CUDA IPC not possible between GPU%d and GPU%d\n", device, remote_device);
         }
      }
   }
#else
   (void)L;
   (void)sock;
   (void)bind_addr;
   (void)addr;
#endif
   return 0;
}
Example #16
void THCudaInit(THCState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, count, device);

  state->blasState = (THCBlasState*)malloc(sizeof(THCBlasState));
  THCudaBlas_init(state, count, device);

  int i,j;
  for(i=0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    for (j=0; j < count; ++j)
    {
      if(i != j)
      {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if(can) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            // Any future call to cudaGetLastError will now return an error,
            // even though we've already dealt with this specific error here.
            // Call cudaGetLastError once to reset the last error state.
            cudaGetLastError();

            continue;
          }
          THCudaCheck(err);
        }
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
Example #17
File: THCGeneral.c  Project: noa/cutorch
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));

    for (int j = 0; j < numDevices; ++j) {
      if (i != j) {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));

        if (can) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            // Any future call to cudaGetLastError will now return an error,
            // even though we've already dealt with this specific error here.
            // Call cudaGetLastError once to reset the last error state.
            cudaGetLastError();

            continue;
          }

          THCudaCheck(err);
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
Example #18
magma_int_t
magma_d_initP2P ( magma_int_t *bw_bmark, magma_int_t *num_gpus ){


    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
     (cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("Two or more Tesla(s) with (SM 2.0)"
                        " class GPUs are required for P2P.\n");
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid_tesla[64]; // find the first two GPU's that can support P2P
    int gpu_count = 0;   // GPUs that meet the criteria

    for (int i=0; i < gpu_n; i++) {
         (cudaGetDeviceProperties(&prop[i], i));
        // Only Tesla boards based on Fermi can support P2P
        {
            // This is an array of P2P capable GPUs
            gpuid_tesla[gpu_count++] = i;
        }
    }
    *num_gpus=gpu_n;

     for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
  // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");
    int can_access_peer_0_1, can_access_peer_1_0;
    // In this case we just pick the first two that we can support
     (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], 
                                                        gpuid_tesla[j]));
     (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], 
                                                        gpuid_tesla[i]));


    // Output results from P2P capabilities
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],                                                                  
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j] ,
                             can_access_peer_0_1 ? "Yes" : "No");
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j],
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],
                            can_access_peer_1_0 ? "Yes" : "No");

    if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
    {
        printf("Two or more Tesla(s) with class"
                " GPUs are required for P2P to run.\n");
        printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n");
        printf("Peer to Peer access is not available between"
        " GPU%d <-> GPU%d, waiving test.\n", gpuid_tesla[i], gpuid_tesla[j]);
        printf("PASSED\n");
        //exit(EXIT_SUCCESS);
    }
     }    
    }

  // Enable peer access
     for(int i=0; i<gpu_n; i++)
     {
         for(int j=i+1; j<gpu_n; j++)
         {
             printf("Enabling peer access between GPU%d and GPU%d...\n",
                gpuid_tesla[i], gpuid_tesla[j]);
              (cudaSetDevice(gpuid_tesla[i]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0));
              (cudaSetDevice(gpuid_tesla[j]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0));
           magma_dcheckerr("P2P");
         }
     }

   magma_dcheckerr("P2P successful");


    // Enable peer access
    for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Check that we got UVA on both devices
    printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 
    gpuid_tesla[i], gpuid_tesla[j]);
    //const bool has_uva = (prop[gpuid_tesla[i]].unifiedAddressing && 
    //                            prop[gpuid_tesla[j]].unifiedAddressing);

    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[i]].name, 
    gpuid_tesla[i], (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No") );
    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[j]].name, 
    gpuid_tesla[j], (prop[gpuid_tesla[j]].unifiedAddressing ? "Yes" : "No") );

        }
    }



  if(*bw_bmark==1){


    // P2P memcopy() benchmark
   for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", 
                int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]);
    (cudaSetDevice(gpuid_tesla[i]));
    float* g0;
    (cudaMalloc(&g0, buf_size));
    (cudaSetDevice(gpuid_tesla[j]));
    float* g1;
    (cudaMalloc(&g1, buf_size));
    float* h0;
    (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    (cudaEventCreateWithFlags(&start_event, eventflags));
    (cudaEventCreateWithFlags(&stop_event, eventflags));


    (cudaEventRecord(start_event, 0));
    for (int k=0; k<100; k++)
    {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
            
        // Ping-pong copy between GPUs
        if (k % 2 == 0)
            (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        else
            (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    }
    (cudaEventRecord(stop_event, 0));
    (cudaEventSynchronize(stop_event));
    (cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between"
            "GPU%d and GPU%d: %.2fGB/s\n", gpuid_tesla[i], gpuid_tesla[j],
        (1.0f / (time_memcpy / 1000.0f)) 
            * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);


     // Cleanup and shutdown
    printf("Cleanup of P2P benchmark...\n");
    (cudaEventDestroy(start_event));
    (cudaEventDestroy(stop_event));
    (cudaSetDevice(gpuid_tesla[i]));
    (magma_free( g0) );
    (cudaSetDevice(gpuid_tesla[j]));
    (magma_free( g1) );
    (magma_free_cpu( h0) );

    }
    }

    // host-device memcopy() benchmark

        for(int j=0; j<gpu_n; j++)
        {
    cudaSetDevice(gpuid_tesla[j]);

    int *h_data_source;
    int *h_data_sink;

    int *h_data_in[STREAM_COUNT];
    int *d_data_in[STREAM_COUNT];

    int *h_data_out[STREAM_COUNT];
    int *d_data_out[STREAM_COUNT];


    cudaEvent_t cycleDone[STREAM_COUNT];
    cudaStream_t stream[STREAM_COUNT];

    cudaEvent_t start, stop;

    // Allocate resources
    int memsize;
    memsize = 1000000 * sizeof(int);

    h_data_source = (int*) malloc(memsize);
    h_data_sink = (int*) malloc(memsize);    

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        ( cudaHostAlloc(&h_data_in[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_in[i], memsize) );
        
        ( cudaHostAlloc(&h_data_out[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_out[i], memsize) );

        
        ( cudaStreamCreate(&stream[i]) );
        ( cudaEventCreate(&cycleDone[i]) ); 
        
        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start); cudaEventCreate(&stop);

    

    // Time host-device copies
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, 
        cudaMemcpyHostToDevice,0) );
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_h2d_time;    
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, 
        cudaMemcpyDeviceToHost, 0) );        
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_d2h_time;    
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
    
    cudaEventSynchronize(stop);
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n", j,
        memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time );
    printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n", j,
        memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);

    // Free resources

    free( h_data_source );
    free( h_data_sink );

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        magma_free_cpu( h_data_in[i] );
        magma_free( d_data_in[i] );

        magma_free_cpu( h_data_out[i] );
        magma_free( d_data_out[i] );
        
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);        
    }
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    
   }

  }//end if-loop bandwidth_benchmark

    magma_dcheckerr("P2P established");

    return MAGMA_SUCCESS;

}
Example #19
void mem_control_kernel_float(float *starting_point_A, float **A_dev,
                        LRU_t **LRUs, const int GPUs, const int GPU_id, int block_dim,
                        int *mem_cpy_counter, reader_tracker *addr_track,
                        cudaStream_t *stream,
                        int nrowa_dev, int ncola_dev, int lda) {
    rbt_node* block_A = rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map));
    if( block_A == NULL ) { //new element
        //fprintf(stderr, "==========new element========\n");
        //traverse_LRU_se(LRU);
        int search_l_GPU = GPU_id-1;
        int search_r_GPU = GPU_id+1;
        rbt_node *block_A_l = NULL;
        rbt_node *block_A_r = NULL;
        while (block_A_l == NULL && block_A_r == NULL) {
            if (search_l_GPU >= 0) {
                block_A_l = rbt_find(starting_point_A, &(LRUs[search_l_GPU]->hash_map));
                if (block_A_l != NULL) {
                    if (block_A_l->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_l_GPU);
                        if(peer_access_check == 1) block_A_l = NULL;
                    }
                }
                search_l_GPU--;
            }
            if (search_r_GPU < GPUs) {
                block_A_r = rbt_find(starting_point_A, &(LRUs[search_r_GPU]->hash_map));
                if (block_A_r != NULL) {
                    if (block_A_r->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_r_GPU);
                        if(peer_access_check == 1) block_A_r = NULL;
                    }
                }
                search_r_GPU++;
            }
            if (search_l_GPU < 0 && search_r_GPU >= GPUs) {
                break;
            }
        }
        // rectification
        search_l_GPU++; search_r_GPU--;
        assert(search_l_GPU >= 0 && search_l_GPU < GPUs);
        assert(search_r_GPU >= 0 && search_r_GPU < GPUs);
        
        if ( !(block_A_l == NULL && block_A_r == NULL) ) {
            //inter GPU communication
            int target_GPU_id = 0;
            if (block_A_l != NULL && block_A_r != NULL) {
                if (ABS(search_l_GPU - GPU_id) > ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_r_GPU;
                    block_A       = block_A_r;
                } else if(ABS(search_l_GPU - GPU_id) < ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_l_GPU;
                    block_A       = block_A_l;
                } else {
                    int rand_select = rand()%10;
                    if (rand_select < 5) {
                        target_GPU_id = search_l_GPU;
                        block_A       = block_A_l;
                    } else {
                        target_GPU_id = search_r_GPU;
                        block_A       = block_A_r;
                    }
                }
                if(block_A->associated_LRU_elem->is_trans_done != 1)
                goto new_block;
                //fprintf(stderr, "==>3  block on GPUs:(%d, %d), but chose %d(done:%d) as curt GPU is %d (block_A_l:%p, block_A_r:%p)\n", search_l_GPU, search_r_GPU, target_GPU_id, block_A->associated_LRU_elem->is_trans_done, GPU_id, block_A_l, block_A_r);
            } else {
                if (block_A_l != NULL && block_A_r == NULL) {
                    target_GPU_id = search_l_GPU;
                    block_A       = block_A_l;
                } else if(block_A_r != NULL && block_A_l == NULL) {
                    target_GPU_id = search_r_GPU;
                    block_A       = block_A_r;
                }
                if(block_A->associated_LRU_elem->is_trans_done != 1)
                goto new_block;
                //printf("==>2  block on GPUs:%d, and curt GPU is %d (done:%d)\n", target_GPU_id, GPU_id, block_A->associated_LRU_elem->is_trans_done);
            }
            if (rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) == NULL)
            goto new_block;
            atomic_reader_plus(block_A);
            *A_dev = (float*) LRU_in(starting_point_A, LRUs[GPU_id], sizeof(float)*block_dim*block_dim, GPU_id);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) != NULL);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map))->associated_LRU_elem->is_trans_done == 1);
            assert( cudaMemcpyPeerAsync(*A_dev, GPU_id, block_A->associated_LRU_elem->GPU_p, target_GPU_id, sizeof(float)*block_dim*block_dim, *stream) == cudaSuccess );
            //cannot dequeue the GPU mem at the target GPU
            addr_track[*mem_cpy_counter].addr   = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = target_GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 1;
            (*mem_cpy_counter) += 1;
            // cannot dequeue the current new GPU mem
            addr_track[*mem_cpy_counter].addr   = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        } else {
        new_block:
            //(block_A_r == NULL && block_A_l == NULL) {
            //bring new blocks
            //printf("==>1  bring new block to GPU:%d\n", GPU_id);
            (*A_dev) = (float*) LRU_in(starting_point_A, LRUs[GPU_id], sizeof(float)*block_dim*block_dim, GPU_id);
            assert( cublasSetMatrixAsync(nrowa_dev, ncola_dev, sizeof(float), starting_point_A, lda, *A_dev, block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
            addr_track[*mem_cpy_counter].addr          = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id        = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        }
    } else {
        atomic_reader_plus(block_A);
        assert( rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map)) != NULL);
        *A_dev = (float*) LRU_reorder(starting_point_A, LRUs[GPU_id]);
        addr_track[*mem_cpy_counter].addr   = starting_point_A;
        addr_track[*mem_cpy_counter].GPU_id = GPU_id;
        (*mem_cpy_counter) += 1;
    }
}
Example #20
magma_int_t magma_buildconnection_mgpu(  magma_int_t gnode[MagmaMaxGPUs+2][MagmaMaxGPUs+2], magma_int_t *nbcmplx, magma_int_t ngpu)
{
    magma_int_t *deviceid = (magma_int_t *) malloc(ngpu*sizeof(magma_int_t));
    memset(deviceid,0,ngpu*sizeof(magma_int_t));

    nbcmplx[0] =0;


    //printf(" Initializing....\n\n");
    //printf(" This machine has %d GPU\n",ngpu);

    //printf(" cudaSuccess %d, cudaErrorInvalidDevice %d, cudaErrorPeerAccessAlreadyEnabled %d, cudaErrorInvalidValue %d \n",    cudaSuccess, cudaErrorInvalidDevice,cudaErrorPeerAccessAlreadyEnabled, cudaErrorInvalidValue );

    int samecomplex=-1;
    cudaError_t err,scerr;
    cudaDeviceProp prop;

    magma_int_t cmplxnb = 0;
    magma_int_t cmplxid = 0;
    magma_int_t lcgpunb = 0;
    for( magma_int_t d = 0; d < ngpu; ++d ) {
        // check for unified memory & enable peer memory access between all GPUs.            
        magma_setdevice( d );
        cudaGetDeviceProperties( &prop, d );
        if ( ! prop.unifiedAddressing ) {
            printf( "device %d doesn't support unified addressing\n", (int) d );
            return -1;
        }
        // add this device to the list if not added yet.
        // not added yet meaning belong to a new complex
        if(deviceid[d]==0){
            cmplxnb           = cmplxnb+1;
            cmplxid           = cmplxnb-1;
            gnode[cmplxid][MagmaMaxGPUs] = 1;
            lcgpunb           = gnode[cmplxid][MagmaMaxGPUs]-1;
            gnode[cmplxid][lcgpunb] = d;
            deviceid[d]=-1;
        }
        //printf("DEVICE %d : \n",d);

        for( magma_int_t d2 = d+1; d2 < ngpu; ++d2 ) {
            // check for unified memory & enable peer memory access between all GPUs.            
            magma_setdevice( d2 );
            cudaGetDeviceProperties( &prop, d2 );
            if ( ! prop.unifiedAddressing ) {
                printf( "device %d doesn't support unified addressing\n", (int) d2 );
                return -1;
            }

            scerr = cudaDeviceCanAccessPeer(&samecomplex,d,d2); 

            //printf(" device %d and device %d have samecomplex= %d\n",d,d2,samecomplex);
            if(samecomplex==1){
                // d and d2 are on the same complex so add them, note that d is already added
                // so just enable the peer Access for d and enable+add d2.
                // FOR d:
                magma_setdevice( d );
                err   = cudaDeviceEnablePeerAccess( d2, 0 );
                //printf("enabling devide %d ==> %d  error %d\n",d,d2,err);
                if ( err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled ) {
                    printf( "device %d cudaDeviceEnablePeerAccess error %d\n", (int) d2, (int) err );
                    return -2;
                }

                // FOR d2:
                magma_setdevice( d2 );
                err   = cudaDeviceEnablePeerAccess( d, 0 );
                //printf("enabling devide %d ==> %d  error %d\n",d2,d,err);
                if((err==cudaSuccess)||(err==cudaErrorPeerAccessAlreadyEnabled)){
                    if(deviceid[d2]==0){
                        //printf("adding device %d\n",d2);
                        gnode[cmplxid][MagmaMaxGPUs] = gnode[cmplxid][MagmaMaxGPUs]+1;
                        lcgpunb           = gnode[cmplxid][MagmaMaxGPUs]-1;
                        gnode[cmplxid][lcgpunb] = d2;
                        deviceid[d2]=-1;
                    }
                }else{
                    printf( "device %d cudaDeviceEnablePeerAccess error %d\n", (int) d, (int) err );
                    return -2;
                }
            }
        }
    }

    nbcmplx[0] = cmplxnb; 
    return cmplxnb;
}
Example #21
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        SPRINTF(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);


#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }

#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

#endif

        printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);


        printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2)
    {
        cudaDeviceProp prop[64];
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;

        for (int i=0; i < deviceCount; i++)
        {
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

            // Only boards based on Fermi or later can support P2P
            if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
                && prop[i].tccDriver
#endif
               )
            {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }

        // Show all the combinations of support P2P GPUs
        int can_access_peer;

        if (gpu_p2p_count >= 2)
        {
            for (int i = 0; i < gpu_p2p_count; i++)
            {
                for (int j = 0; j < gpu_p2p_count; j++)
                {
                    if (gpuid[i] == gpuid[j])
                    {
                        continue;
                    }
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                        printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
                           prop[gpuid[j]].name, gpuid[j] ,
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
        sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    printf("Result = PASS\n");

    // finish
    exit(EXIT_SUCCESS);
}
Example #22
void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
#ifndef CPU_ONLY
    vector<int> remaining(devices);

    // Depth for reduction tree
    int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));

    // Group GPUs by board
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                cudaDeviceProp a, b;
                CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
                CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
                if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
                    if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
                        pairs->push_back(DevicePair(remaining[i], remaining[j]));
                        DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
                        remaining.erase(remaining.begin() + j);
                        break;
                    }
                }
            }
        }
    }
    ostringstream s;
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();

    // Group by P2P accessibility
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                int access;
                CUDA_CHECK(
                    cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
                if (access) {
                    pairs->push_back(DevicePair(remaining[i], remaining[j]));
                    DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
                    remaining.erase(remaining.begin() + j);
                    break;
                }
            }
        }
    }
    s.str("");
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();

    // Group remaining
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
            DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
                       << remaining[i + 1];
            remaining.erase(remaining.begin() + i + 1);
        }
    }

    // Should only be the parent node remaining
    CHECK_EQ(remaining.size(), 1);

    pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));

    CHECK(pairs->size() == devices.size());
    for (int i = 0; i < pairs->size(); ++i) {
        CHECK((*pairs)[i].parent() != (*pairs)[i].device());
        for (int j = i + 1; j < pairs->size(); ++j) {
            CHECK((*pairs)[i].device() != (*pairs)[j].device());
        }
    }
#else
    NO_GPU;
#endif
}
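
A hypothetical caller sketch for the pairing routine above (the device list is an assumption, not taken from the Caffe sources):

// Hypothetical usage: build the reduction-tree pairs for four GPUs and
// log each parent -> device edge produced by DevicePair::compute().
vector<int> gpus;
gpus.push_back(0); gpus.push_back(1); gpus.push_back(2); gpus.push_back(3);
vector<DevicePair> pairs;
DevicePair::compute(gpus, &pairs);
for (int i = 0; i < pairs.size(); ++i) {
    LOG(INFO) << pairs[i].parent() << " -> " << pairs[i].device();
}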