void Caffe::ConnectMasterSlaveDevice(const int master_device_id,
                                     const int slave_device_id) {
  int can_access;
  int target_device_id = slave_device_id;

  CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, master_device_id, slave_device_id));
  if (can_access == 0) {
    LOG(WARNING) << "Device P2P access from GPU " << master_device_id
                 << " to GPU " << slave_device_id
                 << " cannot be enabled. Data transfer may be slow.";
  } else {
    CUDA_CHECK(cudaSetDevice(master_device_id));
    CUDA_CHECK(cudaDeviceEnablePeerAccess(target_device_id, 0));
    LOG(INFO) << "Device P2P access from GPU " << master_device_id
              << " to GPU " << slave_device_id << " enabled.";
  }

  CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, slave_device_id, master_device_id));
  if (can_access == 0) {
    LOG(WARNING) << "Device P2P access from GPU " << slave_device_id
                 << " to GPU " << master_device_id
                 << " cannot be enabled. Data transfer may be slow.";
  } else {
    CUDA_CHECK(cudaSetDevice(slave_device_id));
    CUDA_CHECK(cudaDeviceEnablePeerAccess(master_device_id, 0));
    LOG(INFO) << "Device P2P access from GPU " << slave_device_id
              << " to GPU " << master_device_id << " enabled.";
  }

  CUDA_CHECK(cudaSetDevice(master_device_id));
}
int main(int argc, char **argv) {
  // define the pointers
  int size = WIDTH * HEIGHT;
  float *h_data, *d_send_data, *d_recv_data;
  bool use_cuda_time = true;

  if (argc != 3) {
    std::cout << "expects exactly 2 arguments: the two device ids" << std::endl;
    std::cout << "e.g.: bandwidth_test_between2gpu 0 1" << std::endl;
    return 1;
  }

  int id0 = atoi(argv[1]);
  int id1 = atoi(argv[2]);
  std::cout << "id0=" << id0 << ", id1=" << id1 << std::endl;

  // Pinned host memory, so host<->device copies are not staged.
  cudaMallocHost(&h_data, size * sizeof(float));
  init_data(h_data, size);

  cudaSetDevice(id0);
  cudaMalloc(&d_send_data, size * sizeof(float));
  cudaSetDevice(id1);
  cudaMalloc(&d_recv_data, size * sizeof(float));
  // With UVA the runtime resolves which device owns d_send_data.
  cudaMemcpy(d_send_data, h_data, size * sizeof(float), cudaMemcpyHostToDevice);

  int can_access_peer_0_1, can_access_peer_1_0;
  cudaSetDevice(id0);
  CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_0_1, id0, id1));
  CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_1_0, id1, id0));
  if (can_access_peer_0_1 && can_access_peer_1_0) {
    std::cout << "P2P access between GPU " << id0 << " and GPU " << id1 << ": Yes" << std::endl;
    cudaSetDevice(id0);
    CUDA_CHECK(cudaDeviceEnablePeerAccess(id1, 0));
    cudaSetDevice(id1);
    CUDA_CHECK(cudaDeviceEnablePeerAccess(id0, 0));
  } else {
    std::cout << "P2P access between GPU " << id0 << " and GPU " << id1 << ": No" << std::endl;
  }

  cudaSetDevice(id1);
  use_cuda_time = false;
  //use_cuda_time = true;
  test_2gpu(d_send_data, d_recv_data, size, id0, id1, use_cuda_time);

  cudaFreeHost(h_data);
  cudaFree(d_send_data);
  cudaFree(d_recv_data);
  return 0;
}
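test_2gpu and init_data are not shown in this listing; the sketch below is only a guess at what such a peer-copy timing helper could look like (name, signature, and iteration count are assumptions, not the original implementation). It times repeated cudaMemcpyPeerAsync calls with CUDA events once P2P access has been enabled.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical sketch, not the original test_2gpu: times repeated peer copies
// from d_send (resident on dev_src) to d_recv (resident on dev_dst).
static void test_2gpu_sketch(const float *d_send, float *d_recv, int count,
                             int dev_src, int dev_dst, bool /*use_cuda_time*/) {
  const int iters = 100;                      // assumed repetition count
  const size_t bytes = count * sizeof(float);

  cudaSetDevice(dev_dst);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, 0);
  for (int i = 0; i < iters; ++i) {
    // Explicit peer copy; falls back to staging through the host if P2P is off.
    cudaMemcpyPeerAsync(d_recv, dev_dst, d_send, dev_src, bytes, 0);
  }
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  double gbps = (double)bytes * iters / (ms / 1000.0) / (1 << 30);
  printf("GPU%d -> GPU%d: %.2f GB/s\n", dev_src, dev_dst, gbps);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
}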
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
                                  int enable)
{
  /* This will perform device bounds checking for us */
  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);

  if (enable != prevEnabled) {
    /* If we're attempting to enable p2p access but p2p access isn't */
    /* supported, throw an error */
    if (enable) {
      int access = 0;
      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));

      if (!access) {
        THError("p2p access not supported for %d accessing %d",
                dev, devToAccess);
      }
    }

    state->p2pAccessEnabled[dev][devToAccess] = enable;

    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    /* This should be in sync with the current access state */
    if (enable) {
      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
    } else {
      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
}
void THCudaInit(THCudaState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCudaRNGState*)malloc(sizeof(THCudaRNGState));
  THCRandom_init(state->rngState, count, device);

  THCudaBlas_init(count, device);

  int i, j;
  for (i = 0; i < count; ++i) {
    THCudaCheck(cudaSetDevice(i));
    for (j = 0; j < count; ++j) {
      if (i != j) {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if (can)
          THCudaCheck(cudaDeviceEnablePeerAccess(j, 0));
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices) {
    THError("%d is not a device", dev);
  }
  if (devToAccess < 0 || devToAccess >= state->numDevices) {
    THError("%d is not a device", devToAccess);
  }
  if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    int access = 0;
    THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
    if (access) {
      cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // ignore and clear the error if access was already enabled
        cudaGetLastError();
      } else {
        THCudaCheck(err);
      }
      state->p2pAccessEnabled[dev][devToAccess] = 1;
    } else {
      state->p2pAccessEnabled[dev][devToAccess] = 0;
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  /* By default, all direct p2p kernel access (besides copy) is disallowed, */
  /* since direct access without knowing whether or not a certain operation */
  /* should be cross-GPU leads to synchronization errors. The user can choose */
  /* to disable this functionality, however. */
  state->p2pKernelAccessEnabled = 0;

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
  for (int i = 0; i < numDevices; ++i) {
    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
  }

  /* Build a table of all allowed p2p accesses, to avoid checking the p2p
     status at runtime. */
  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));

    for (int j = 0; j < numDevices; ++j) {
      /* Presume no access by default */
      state->p2pAccessEnabled[i][j] = 0;

      if (i == j) {
        /* A GPU can access itself */
        state->p2pAccessEnabled[i][j] = 1;
      } else {
        int access = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&access, i, j));

        if (access) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            /* Any future call to cudaGetLastError will now return an error, */
            /* even though we've already dealt with this specific error here. */
            /* Call cudaGetLastError once to reset the last error state. */
            cudaGetLastError();
            continue;
          }

          /* In case there are unknown errors returned from the above */
          THCudaCheck(err);

          /* Access could be enabled */
          state->p2pAccessEnabled[i][j] = 1;
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
TEST(PeerAccess, EnableDisable) {
  cudaError_t ret;
  int devices;

  ret = cudaGetDeviceCount(&devices);
  ASSERT_EQ(cudaSuccess, ret);

  if (devices <= 1) {
    return;
  }

  int version;
  ret = cudaRuntimeGetVersion(&version);
  ASSERT_EQ(cudaSuccess, ret);

  typedef std::pair<int, int> peer_t;
  std::vector<peer_t> peers;

  for (int i = 0; i < devices; i++) {
    ret = cudaSetDevice(i);
    ASSERT_EQ(cudaSuccess, ret);

    for (int j = 0; j < devices; j++) {
      int peer;
      ret = cudaDeviceCanAccessPeer(&peer, i, j);
      ASSERT_EQ(cudaSuccess, ret);

      cudaError_t expected;
      if (peer) {
        expected = cudaSuccess;
        peers.push_back(peer_t(i, j));
#if CUDA_VERSION >= 5000
      } else if (version >= 5000 /* 5.0 */) {
        expected = cudaErrorPeerAccessUnsupported;
#endif
      } else {
        expected = cudaErrorInvalidDevice;
      }

      ret = cudaDeviceEnablePeerAccess(j, 0);
      EXPECT_EQ(expected, ret);
    }
  }

  /* Cleanup. */
  const size_t n_peers = peers.size();
  for (size_t i = 0; i < n_peers; i++) {
    ret = cudaSetDevice(peers[i].first);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaDeviceDisablePeerAccess(peers[i].second);
    EXPECT_EQ(cudaSuccess, ret);
  }
}
void cuda_p2p_table(int n, bool table[n][n])
{
  assert(n == cuda_devices());
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      int r;
      CUDA_ERROR(cudaDeviceCanAccessPeer(&r, i, j));
      table[i][j] = (1 == r);
    }
  }
}
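A short usage sketch for cuda_p2p_table, assuming cuda_devices() simply returns cudaGetDeviceCount()'s value; the printing format and the print_p2p_matrix helper are illustrative only.

#include <stdbool.h>
#include <stdio.h>

int  cuda_devices(void);                       /* assumed helper from the same codebase */
void cuda_p2p_table(int n, bool table[n][n]);  /* the function shown above */

void print_p2p_matrix(void)
{
  int n = cuda_devices();
  bool table[n][n];                            /* C99 variable-length array */
  cuda_p2p_table(n, table);
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++)
      printf("%c ", table[i][j] ? 'Y' : '.');
    printf("\n");
  }
}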
TEST(PeerAccess, CanAccessInvalidDevice) {
  cudaError_t ret;
  int devices;

  ret = cudaGetDeviceCount(&devices);
  ASSERT_EQ(cudaSuccess, ret);

  int device;
  ret = cudaGetDevice(&device);
  ASSERT_EQ(cudaSuccess, ret);

  int peer;
  ret = cudaDeviceCanAccessPeer(&peer, device, devices);
  EXPECT_EQ(cudaErrorInvalidDevice, ret);
}
P2PSync<Dtype>::P2PSync(shared_ptr<Solver<Dtype> > root_solver,
                        P2PSync<Dtype>* parent, const SolverParameter& param)
    : GPUParams<Dtype>(root_solver, param.device_id()),
      parent_(parent),
      children_(),
      queue_(),
      initial_iter_(root_solver->iter()),
      solver_() {
#ifndef CPU_ONLY
  int initial_device;
  CUDA_CHECK(cudaGetDevice(&initial_device));
  const int self = param.device_id();
  CUDA_CHECK(cudaSetDevice(self));

  if (parent == NULL) {
    solver_ = root_solver;
  } else {
    Caffe::set_root_solver(false);
    solver_.reset(new WorkerSolver<Dtype>(param, root_solver.get()));
    Caffe::set_root_solver(true);
  }
  this->configure(solver_.get());
  solver_->add_callback(this);

  if (parent) {
    // Enable p2p access between devices
    const int peer = parent->solver_->param().device_id();
    int access;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
    if (access) {
      CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
    } else {
      LOG(INFO) << "GPU " << self << " does not have p2p access to GPU " << peer;
    }
    // Allocate receiving buffer on parent
    CUDA_CHECK(cudaSetDevice(peer));
    CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype)));
    CUDA_CHECK(cudaSetDevice(self));
  }

  CUDA_CHECK(cudaSetDevice(initial_device));
#else
  NO_GPU;
#endif
}
TEST(PeerAccess, DeviceReset) {
  cudaError_t ret;
  int devices;

  ret = cudaGetDeviceCount(&devices);
  ASSERT_EQ(cudaSuccess, ret);

  if (devices <= 1) {
    return;
  }

  bool found = false;
  int dj;

  for (int i = 0; i < devices && !(found); i++) {
    ret = cudaSetDevice(i);
    ASSERT_EQ(cudaSuccess, ret);

    for (int j = 0; j < devices; j++) {
      int peer;
      ret = cudaDeviceCanAccessPeer(&peer, i, j);
      ASSERT_EQ(cudaSuccess, ret);

      if (peer) {
        ret = cudaDeviceEnablePeerAccess(j, 0);
        ASSERT_EQ(cudaSuccess, ret);

        found = true;
        dj = j;
        break;
      }
    }
  }

  if (!(found)) {
    return;
  }

  /* Perform a device reset. */
  ret = cudaDeviceReset();
  EXPECT_EQ(cudaSuccess, ret);

  ret = cudaDeviceDisablePeerAccess(dj);
  EXPECT_EQ(cudaErrorPeerAccessNotEnabled, ret);
}
TEST(PeerAccess, CanAccess) {
  cudaError_t ret;
  int devices;

  ret = cudaGetDeviceCount(&devices);
  ASSERT_EQ(cudaSuccess, ret);

  for (int i = 0; i < devices; i++) {
    for (int j = 0; j < devices; j++) {
      int peer;
      ret = cudaDeviceCanAccessPeer(&peer, i, j);
      EXPECT_EQ(cudaSuccess, ret);

      if (i == j) {
        EXPECT_FALSE(peer);
      }
    }
  }
}
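The tests above only query capability; the minimal check-enable-copy sequence they exercise looks roughly like the sketch below. Device ids, buffer size, and error handling are illustrative assumptions, not taken from any of the listed projects.

#include <cstdio>
#include <cuda_runtime.h>

// Minimal sketch: enable P2P access from device 0 to device 1's memory,
// then copy a buffer between the two devices.
int p2p_copy_demo() {
  const int src_dev = 1, dst_dev = 0;      // arbitrary choice of two devices
  const size_t bytes = 1 << 20;

  int can = 0;
  cudaDeviceCanAccessPeer(&can, dst_dev, src_dev);   // can dst map src's memory?

  float *d_src = nullptr, *d_dst = nullptr;
  cudaSetDevice(src_dev);
  cudaMalloc(&d_src, bytes);
  cudaSetDevice(dst_dev);
  cudaMalloc(&d_dst, bytes);

  if (can) {
    // Must be called from the accessing device (dst_dev is current here).
    cudaError_t err = cudaDeviceEnablePeerAccess(src_dev, 0);
    if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled)
      return -1;
    cudaGetLastError();                    // clear "already enabled" if it was set
  }

  // cudaMemcpyPeer works whether or not P2P is enabled; it is simply faster
  // (direct over NVLink/PCIe) when it is.
  cudaMemcpyPeer(d_dst, dst_dev, d_src, src_dev, bytes);
  cudaDeviceSynchronize();

  cudaFree(d_dst);
  cudaSetDevice(src_dev);
  cudaFree(d_src);
  printf("peer copy done (P2P %s)\n", can ? "enabled" : "unavailable");
  return 0;
}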
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
  int initial_device;
  CUDA_CHECK(cudaGetDevice(&initial_device));
  const int self = solver_->param().device_id();
  CUDA_CHECK(cudaSetDevice(self));

  if (parent_) {
    CUDA_CHECK(cudaFree(parent_grads_));
    const int peer = parent_->solver_->param().device_id();
    int access;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
    if (access) {
      CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
    }
  }

  CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
static int can_use_fastpath(lua_State *L, int sock, uint32_t bind_addr, uint32_t addr) {
#if defined(USE_CUDA) && !defined(__APPLE__)
  if (bind_addr == addr) {
    int device;
    THCudaCheck(cudaGetDevice(&device));
    int ret = send(sock, &device, sizeof(device), 0);
    if (ret < 0) {
      close(sock);
      return LUA_HANDLE_ERROR(L, errno);
    }
    int remote_device;
    ret = recv(sock, &remote_device, sizeof(remote_device), 0);
    if (ret <= 0) {
      close(sock);
      return LUA_HANDLE_ERROR(L, errno);
    }
    if (device != remote_device) {
      int can;
      THCudaCheck(cudaDeviceCanAccessPeer(&can, device, remote_device));
      if (can) {
        cudaError_t err = cudaDeviceEnablePeerAccess(remote_device, 0);
        if (err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled) {
          if (err == cudaErrorPeerAccessAlreadyEnabled) cudaGetLastError();
          fprintf(stderr, "INFO: torch-ipc: CUDA IPC enabled between GPU%d and GPU%d\n",
                  device, remote_device);
          return 1;
        } else {
          fprintf(stderr, "WARN: torch-ipc: CUDA IPC disabled between GPU%d and GPU%d: %s\n",
                  device, remote_device, cudaGetErrorString(err));
        }
      } else {
        fprintf(stderr, "INFO: torch-ipc: CUDA IPC not possible between GPU%d and GPU%d\n",
                device, remote_device);
      }
    }
  }
#else
  (void)L;
  (void)sock;
  (void)bind_addr;
  (void)addr;
#endif
  return 0;
}
void THCudaInit(THCState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, count, device);

  state->blasState = (THCBlasState*)malloc(sizeof(THCBlasState));
  THCudaBlas_init(state, count, device);

  int i, j;
  for (i = 0; i < count; ++i) {
    THCudaCheck(cudaSetDevice(i));
    for (j = 0; j < count; ++j) {
      if (i != j) {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if (can) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            // Any future call to cudaGetLastError will now return an error,
            // even though we've already dealt with this specific error here.
            // Call cudaGetLastError once to reset the last error state.
            cudaGetLastError();
            continue;
          }
          THCudaCheck(err);
        }
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));
    for (int j = 0; j < numDevices; ++j) {
      if (i != j) {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if (can) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            // Any future call to cudaGetLastError will now return an error,
            // even though we've already dealt with this specific error here.
            // Call cudaGetLastError once to reset the last error state.
            cudaGetLastError();
            continue;
          }
          THCudaCheck(err);
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
magma_int_t magma_d_initP2P( magma_int_t *bw_bmark, magma_int_t *num_gpus )
{
  // Number of GPUs
  printf("Checking for multiple GPUs...\n");
  int gpu_n;
  (cudaGetDeviceCount(&gpu_n));
  printf("CUDA-capable device count: %i\n", gpu_n);
  if (gpu_n < 2) {
    printf("Two or more Tesla(s) with (SM 2.0) class GPUs are required for P2P.\n");
  }

  // Query device properties
  cudaDeviceProp prop[64];
  int gpuid_tesla[64]; // find the first two GPUs that can support P2P
  int gpu_count = 0;   // GPUs that meet the criteria

  for (int i = 0; i < gpu_n; i++) {
    (cudaGetDeviceProperties(&prop[i], i));
    // Only Tesla boards based on Fermi can support P2P
    {
      // This is an array of P2P capable GPUs
      gpuid_tesla[gpu_count++] = i;
    }
  }
  *num_gpus = gpu_n;

  for (int i = 0; i < gpu_n; i++) {
    for (int j = i+1; j < gpu_n; j++) {
      // Check possibility for peer access
      printf("\nChecking GPU(s) for support of peer to peer memory access...\n");
      int can_access_peer_0_1, can_access_peer_1_0;
      // In this case we just pick the first two that we can support
      (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], gpuid_tesla[j]));
      (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], gpuid_tesla[i]));

      // Output results from P2P capabilities
      printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
             prop[gpuid_tesla[i]].name, gpuid_tesla[i],
             prop[gpuid_tesla[j]].name, gpuid_tesla[j],
             can_access_peer_0_1 ? "Yes" : "No");
      printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
             prop[gpuid_tesla[j]].name, gpuid_tesla[j],
             prop[gpuid_tesla[i]].name, gpuid_tesla[i],
             can_access_peer_1_0 ? "Yes" : "No");

      if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0) {
        printf("Two or more Tesla(s) with class GPUs are required for P2P to run.\n");
        printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n");
        printf("Peer to Peer access is not available between GPU%d <-> GPU%d, waiving test.\n",
               gpuid_tesla[i], gpuid_tesla[j]);
        printf("PASSED\n");
        //exit(EXIT_SUCCESS);
      }
    }
  }

  // Enable peer access
  for (int i = 0; i < gpu_n; i++) {
    for (int j = i+1; j < gpu_n; j++) {
      printf("Enabling peer access between GPU%d and GPU%d...\n",
             gpuid_tesla[i], gpuid_tesla[j]);
      (cudaSetDevice(gpuid_tesla[i]));
      (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0));
      (cudaSetDevice(gpuid_tesla[j]));
      (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0));
      magma_dcheckerr("P2P");
    }
  }
  magma_dcheckerr("P2P successful");

  // Check UVA support
  for (int i = 0; i < gpu_n; i++) {
    for (int j = i+1; j < gpu_n; j++) {
      // Check that we got UVA on both devices
      printf("Checking GPU%d and GPU%d for UVA capabilities...\n",
             gpuid_tesla[i], gpuid_tesla[j]);
      printf("> %s (GPU%d) supports UVA: %s\n",
             prop[gpuid_tesla[i]].name, gpuid_tesla[i],
             (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No"));
      printf("> %s (GPU%d) supports UVA: %s\n",
             prop[gpuid_tesla[j]].name, gpuid_tesla[j],
             (prop[gpuid_tesla[j]].unifiedAddressing ? "Yes" : "No"));
    }
  }

  if (*bw_bmark == 1) {
    // P2P memcopy() benchmark
    for (int i = 0; i < gpu_n; i++) {
      for (int j = i+1; j < gpu_n; j++) {
        // Allocate buffers
        const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
        printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n",
               int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]);
        (cudaSetDevice(gpuid_tesla[i]));
        float* g0;
        (cudaMalloc(&g0, buf_size));
        (cudaSetDevice(gpuid_tesla[j]));
        float* g1;
        (cudaMalloc(&g1, buf_size));
        float* h0;
        (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

        // Create CUDA event handles
        printf("Creating event handles...\n");
        cudaEvent_t start_event, stop_event;
        float time_memcpy;
        int eventflags = cudaEventBlockingSync;
        (cudaEventCreateWithFlags(&start_event, eventflags));
        (cudaEventCreateWithFlags(&stop_event, eventflags));

        (cudaEventRecord(start_event, 0));
        for (int k = 0; k < 100; k++) {
          // With UVA we don't need to specify source and target devices, the
          // runtime figures this out by itself from the pointers
          // Ping-pong copy between GPUs
          if (k % 2 == 0)
            (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
          else
            (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
        }
        (cudaEventRecord(stop_event, 0));
        (cudaEventSynchronize(stop_event));
        (cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
        printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
               gpuid_tesla[i], gpuid_tesla[j],
               (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);

        // Cleanup and shutdown
        printf("Cleanup of P2P benchmark...\n");
        (cudaEventDestroy(start_event));
        (cudaEventDestroy(stop_event));
        (cudaSetDevice(gpuid_tesla[i]));
        (magma_free( g0 ));
        (cudaSetDevice(gpuid_tesla[j]));
        (magma_free( g1 ));
        (magma_free_cpu( h0 ));
      }
    }

    // host-device memcopy() benchmark
    for (int j = 0; j < gpu_n; j++) {
      cudaSetDevice(gpuid_tesla[j]);

      int *h_data_source;
      int *h_data_sink;
      int *h_data_in[STREAM_COUNT];
      int *d_data_in[STREAM_COUNT];
      int *h_data_out[STREAM_COUNT];
      int *d_data_out[STREAM_COUNT];
      cudaEvent_t cycleDone[STREAM_COUNT];
      cudaStream_t stream[STREAM_COUNT];
      cudaEvent_t start, stop;

      // Allocate resources
      int memsize;
      memsize = 1000000 * sizeof(int);
      h_data_source = (int*) malloc(memsize);
      h_data_sink = (int*) malloc(memsize);
      for (int i = 0; i < STREAM_COUNT; ++i) {
        ( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_in[i], memsize) );
        ( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_out[i], memsize) );
        ( cudaStreamCreate(&stream[i]) );
        ( cudaEventCreate(&cycleDone[i]) );
        cudaEventRecord(cycleDone[i], stream[i]);
      }
      cudaEventCreate(&start);
      cudaEventCreate(&stop);

      // Time host-device copies
      cudaEventRecord(start, 0);
      ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0) );
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float memcpy_h2d_time;
      cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

      cudaEventRecord(start, 0);
      ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0) );
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float memcpy_d2h_time;
      cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
      cudaEventSynchronize(stop);

      printf("Measured timings (throughput):\n");
      printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n",
             j, memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
      printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n",
             j, memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);

      // Free resources
      free( h_data_source );
      free( h_data_sink );
      for (int i = 0; i < STREAM_COUNT; ++i) {
        magma_free_cpu( h_data_in[i] );
        magma_free( d_data_in[i] );
        magma_free_cpu( h_data_out[i] );
        magma_free( d_data_out[i] );
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
      }
      cudaEventDestroy(start);
      cudaEventDestroy(stop);
    }
  } // end if bandwidth_benchmark

  magma_dcheckerr("P2P established");
  return MAGMA_SUCCESS;
}
void mem_control_kernel_float(float *starting_point_A, float **A_dev,
    LRU_t **LRUs, const int GPUs, const int GPU_id, int block_dim,
    int *mem_cpy_counter, reader_tracker *addr_track, cudaStream_t *stream,
    int nrowa_dev, int ncola_dev, int lda)
{
  rbt_node* block_A = rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map));
  if (block_A == NULL) { // new element
    // look for the block on neighboring GPUs, closest first
    int search_l_GPU = GPU_id - 1;
    int search_r_GPU = GPU_id + 1;
    rbt_node *block_A_l = NULL;
    rbt_node *block_A_r = NULL;
    while (block_A_l == NULL && block_A_r == NULL) {
      if (search_l_GPU >= 0) {
        block_A_l = rbt_find(starting_point_A, &(LRUs[search_l_GPU]->hash_map));
        if (block_A_l != NULL) {
          if (block_A_l->associated_LRU_elem->is_trans_done == 0) {
            int peer_access_check = 0;
            cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_l_GPU);
            if (peer_access_check == 1) block_A_l = NULL;
          }
        }
        search_l_GPU--;
      }
      if (search_r_GPU < GPUs) {
        block_A_r = rbt_find(starting_point_A, &(LRUs[search_r_GPU]->hash_map));
        if (block_A_r != NULL) {
          if (block_A_r->associated_LRU_elem->is_trans_done == 0) {
            int peer_access_check = 0;
            cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_r_GPU);
            if (peer_access_check == 1) block_A_r = NULL;
          }
        }
        search_r_GPU++;
      }
      if (search_l_GPU < 0 && search_r_GPU >= GPUs) {
        break;
      }
    }
    // rectification: undo the final increment/decrement
    search_l_GPU++;
    search_r_GPU--;
    assert(search_l_GPU >= 0 && search_l_GPU < GPUs);
    assert(search_r_GPU >= 0 && search_r_GPU < GPUs);

    if (!(block_A_l == NULL && block_A_r == NULL)) {
      // inter-GPU communication
      int target_GPU_id = 0;
      if (block_A_l != NULL && block_A_r != NULL) {
        if (ABS(search_l_GPU - GPU_id) > ABS(search_r_GPU - GPU_id)) {
          target_GPU_id = search_r_GPU;
          block_A = block_A_r;
        } else if (ABS(search_l_GPU - GPU_id) < ABS(search_r_GPU - GPU_id)) {
          target_GPU_id = search_l_GPU;
          block_A = block_A_l;
        } else {
          int rand_select = rand() % 10;
          if (rand_select < 5) {
            target_GPU_id = search_l_GPU;
            block_A = block_A_l;
          } else {
            target_GPU_id = search_r_GPU;
            block_A = block_A_r;
          }
        }
        if (block_A->associated_LRU_elem->is_trans_done != 1) goto new_block;
      } else {
        if (block_A_l != NULL && block_A_r == NULL) {
          target_GPU_id = search_l_GPU;
          block_A = block_A_l;
        } else if (block_A_r != NULL && block_A_l == NULL) {
          target_GPU_id = search_r_GPU;
          block_A = block_A_r;
        }
        if (block_A->associated_LRU_elem->is_trans_done != 1) goto new_block;
      }
      if (rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) == NULL)
        goto new_block;

      atomic_reader_plus(block_A);
      *A_dev = (float*) LRU_in(starting_point_A, LRUs[GPU_id],
                               sizeof(float)*block_dim*block_dim, GPU_id);
      assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) != NULL );
      assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map))->associated_LRU_elem->is_trans_done == 1 );
      assert( cudaMemcpyPeerAsync(*A_dev, GPU_id,
                                  block_A->associated_LRU_elem->GPU_p, target_GPU_id,
                                  sizeof(float)*block_dim*block_dim, *stream) == cudaSuccess );
      // cannot dequeue the GPU mem at the target GPU
      addr_track[*mem_cpy_counter].addr = starting_point_A;
      addr_track[*mem_cpy_counter].GPU_id = target_GPU_id;
      addr_track[*mem_cpy_counter].is_trans_done = 1;
      (*mem_cpy_counter) += 1;
      // cannot dequeue the current new GPU mem
      addr_track[*mem_cpy_counter].addr = starting_point_A;
      addr_track[*mem_cpy_counter].GPU_id = GPU_id;
      addr_track[*mem_cpy_counter].is_trans_done = 0;
      (*mem_cpy_counter) += 1;
    } else {
new_block: // (block_A_r == NULL && block_A_l == NULL): bring a new block to this GPU
      (*A_dev) = (float*) LRU_in(starting_point_A, LRUs[GPU_id],
                                 sizeof(float)*block_dim*block_dim, GPU_id);
      assert( cublasSetMatrixAsync(nrowa_dev, ncola_dev, sizeof(float),
                                   starting_point_A, lda, *A_dev, block_dim,
                                   *stream) == CUBLAS_STATUS_SUCCESS );
      addr_track[*mem_cpy_counter].addr = starting_point_A;
      addr_track[*mem_cpy_counter].GPU_id = GPU_id;
      addr_track[*mem_cpy_counter].is_trans_done = 0;
      (*mem_cpy_counter) += 1;
    }
  } else {
    // block already resident on this GPU: just reorder it in the LRU
    atomic_reader_plus(block_A);
    assert( rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map)) != NULL );
    *A_dev = (float*) LRU_reorder(starting_point_A, LRUs[GPU_id]);
    addr_track[*mem_cpy_counter].addr = starting_point_A;
    addr_track[*mem_cpy_counter].GPU_id = GPU_id;
    (*mem_cpy_counter) += 1;
  }
}
magma_int_t magma_buildconnection_mgpu(
    magma_int_t gnode[MagmaMaxGPUs+2][MagmaMaxGPUs+2],
    magma_int_t *nbcmplx, magma_int_t ngpu)
{
  magma_int_t *deviceid = (magma_int_t *) malloc(ngpu*sizeof(magma_int_t));
  memset(deviceid, 0, ngpu*sizeof(magma_int_t));

  nbcmplx[0] = 0;

  int samecomplex = -1;
  cudaError_t err, scerr;
  cudaDeviceProp prop;

  magma_int_t cmplxnb = 0;
  magma_int_t cmplxid = 0;
  magma_int_t lcgpunb = 0;
  for( magma_int_t d = 0; d < ngpu; ++d ) {
    // check for unified memory & enable peer memory access between all GPUs.
    magma_setdevice( d );
    cudaGetDeviceProperties( &prop, d );
    if ( ! prop.unifiedAddressing ) {
      printf( "device %d doesn't support unified addressing\n", (int) d );
      return -1;
    }
    // add this device to the list if not added yet;
    // not added yet means it belongs to a new complex
    if (deviceid[d] == 0) {
      cmplxnb = cmplxnb + 1;
      cmplxid = cmplxnb - 1;
      gnode[cmplxid][MagmaMaxGPUs] = 1;
      lcgpunb = gnode[cmplxid][MagmaMaxGPUs] - 1;
      gnode[cmplxid][lcgpunb] = d;
      deviceid[d] = -1;
    }

    for( magma_int_t d2 = d+1; d2 < ngpu; ++d2 ) {
      // check for unified memory & enable peer memory access between all GPUs.
      magma_setdevice( d2 );
      cudaGetDeviceProperties( &prop, d2 );
      if ( ! prop.unifiedAddressing ) {
        printf( "device %d doesn't support unified addressing\n", (int) d2 );
        return -1;
      }

      scerr = cudaDeviceCanAccessPeer( &samecomplex, d, d2 );

      if (samecomplex == 1) {
        // d and d2 are on the same complex so add them; note that d is already
        // added, so just enable peer access for d and enable+add d2.
        // FOR d:
        magma_setdevice( d );
        err = cudaDeviceEnablePeerAccess( d2, 0 );
        if ( err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled ) {
          printf( "device %d cudaDeviceEnablePeerAccess error %d\n",
                  (int) d2, (int) err );
          return -2;
        }

        // FOR d2:
        magma_setdevice( d2 );
        err = cudaDeviceEnablePeerAccess( d, 0 );
        if ((err == cudaSuccess) || (err == cudaErrorPeerAccessAlreadyEnabled)) {
          if (deviceid[d2] == 0) {
            gnode[cmplxid][MagmaMaxGPUs] = gnode[cmplxid][MagmaMaxGPUs] + 1;
            lcgpunb = gnode[cmplxid][MagmaMaxGPUs] - 1;
            gnode[cmplxid][lcgpunb] = d2;
            deviceid[d2] = -1;
          }
        } else {
          printf( "device %d cudaDeviceEnablePeerAccess error %d\n",
                  (int) d, (int) err );
          return -2;
        }
      }
    }
  }

  nbcmplx[0] = cmplxnb;
  return cmplxnb;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
  pArgc = &argc;
  pArgv = argv;

  printf("%s Starting...\n\n", argv[0]);
  printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

  int deviceCount = 0;
  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

  if (error_id != cudaSuccess) {
    printf("cudaGetDeviceCount returned %d\n-> %s\n",
           (int)error_id, cudaGetErrorString(error_id));
    printf("Result = FAIL\n");
    exit(EXIT_FAILURE);
  }

  // This function call returns 0 if there are no CUDA capable devices.
  if (deviceCount == 0) {
    printf("There are no available device(s) that support CUDA\n");
  } else {
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
  }

  int dev, driverVersion = 0, runtimeVersion = 0;

  for (dev = 0; dev < deviceCount; ++dev) {
    cudaSetDevice(dev);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

    // Console log
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf("  CUDA Driver Version / Runtime Version: %d.%d / %d.%d\n",
           driverVersion/1000, (driverVersion%100)/10,
           runtimeVersion/1000, (runtimeVersion%100)/10);
    printf("  CUDA Capability Major/Minor version number: %d.%d\n",
           deviceProp.major, deviceProp.minor);

    char msg[256];
    SPRINTF(msg, "  Total amount of global memory: %.0f MBytes (%llu bytes)\n",
            (float)deviceProp.totalGlobalMem/1048576.0f,
            (unsigned long long) deviceProp.totalGlobalMem);
    printf("%s", msg);

    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
    printf("  GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n",
           deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
    // This is supported in CUDA 5.0 (runtime API device properties)
    printf("  Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
    printf("  Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
    if (deviceProp.l2CacheSize) {
      printf("  L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
    }
#else
    // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
    int memoryClock;
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
    printf("  Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
    int memBusWidth;
    getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
    printf("  Memory Bus Width: %d-bit\n", memBusWidth);
    int L2CacheSize;
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
    if (L2CacheSize) {
      printf("  L2 Cache Size: %d bytes\n", L2CacheSize);
    }
#endif

    printf("  Maximum Texture Dimension Size (x,y,z): 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
           deviceProp.maxTexture1D,
           deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
    printf("  Maximum Layered 1D Texture Size, (num) layers: 1D=(%d), %d layers\n",
           deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
    printf("  Maximum Layered 2D Texture Size, (num) layers: 2D=(%d, %d), %d layers\n",
           deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
           deviceProp.maxTexture2DLayered[2]);

    printf("  Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
    printf("  Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
    printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
    printf("  Warp size: %d\n", deviceProp.warpSize);
    printf("  Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
    printf("  Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
    printf("  Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
    printf("  Concurrent copy and kernel execution: %s with %d copy engine(s)\n",
           (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
    printf("  Run time limit on kernels: %s\n",
           deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
    printf("  Integrated GPU sharing Host Memory: %s\n",
           deviceProp.integrated ? "Yes" : "No");
    printf("  Support host page-locked memory mapping: %s\n",
           deviceProp.canMapHostMemory ? "Yes" : "No");
    printf("  Alignment requirement for Surfaces: %s\n",
           deviceProp.surfaceAlignment ? "Yes" : "No");
    printf("  Device has ECC support: %s\n",
           deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    printf("  CUDA Device Driver Mode (TCC or WDDM): %s\n",
           deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                                : "WDDM (Windows Display Driver Model)");
#endif
    printf("  Device supports Unified Addressing (UVA): %s\n",
           deviceProp.unifiedAddressing ? "Yes" : "No");
    printf("  Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
           deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    const char *sComputeMode[] = {
      "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
      "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
      "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
      "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
      "Unknown",
      NULL
    };
    printf("  Compute Mode:\n");
    printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
  }

  // If there are 2 or more GPUs, query to determine whether RDMA is supported
  if (deviceCount >= 2) {
    cudaDeviceProp prop[64];
    int gpuid[64]; // we want to find the first two GPUs that can support P2P
    int gpu_p2p_count = 0;

    for (int i = 0; i < deviceCount; i++) {
      checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

      // Only boards based on Fermi or later can support P2P
      if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
          // on Windows (64-bit), the Tesla Compute Cluster driver for windows
          // must be enabled to support this
          && prop[i].tccDriver
#endif
         ) {
        // This is an array of P2P capable GPUs
        gpuid[gpu_p2p_count++] = i;
      }
    }

    // Show all the combinations of P2P-capable GPUs
    int can_access_peer;
    if (gpu_p2p_count >= 2) {
      for (int i = 0; i < gpu_p2p_count; i++) {
        for (int j = 0; j < gpu_p2p_count; j++) {
          if (gpuid[i] == gpuid[j]) {
            continue;
          }
          checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
          printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                 prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
                 can_access_peer ? "Yes" : "No");
        }
      }
    }
  }

  // csv masterlog info
  // *****************************
  // exe and CUDA driver name
  printf("\n");
  std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
  char cTemp[16];

  // driver version
  sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
  sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
  sProfileString += cTemp;

  // Runtime version
  sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
  sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
  sProfileString += cTemp;

  // Device count
  sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  sprintf_s(cTemp, 10, "%d", deviceCount);
#else
  sprintf(cTemp, "%d", deviceCount);
#endif
  sProfileString += cTemp;

  // Print out all device names
  for (dev = 0; dev < deviceCount; ++dev) {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
    sprintf(cTemp, ", Device%d = ", dev);
#endif
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    sProfileString += cTemp;
    sProfileString += deviceProp.name;
  }

  sProfileString += "\n";
  printf("%s", sProfileString.c_str());

  printf("Result = PASS\n");

  // finish
  exit(EXIT_SUCCESS);
}
void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
#ifndef CPU_ONLY
  vector<int> remaining(devices);

  // Depth for reduction tree
  int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));

  // Group GPUs by board
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      for (int j = i + 1; j < remaining.size(); ++j) {
        cudaDeviceProp a, b;
        CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
        CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
        if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
          if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
            pairs->push_back(DevicePair(remaining[i], remaining[j]));
            DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
            remaining.erase(remaining.begin() + j);
            break;
          }
        }
      }
    }
  }
  ostringstream s;
  for (int i = 0; i < remaining.size(); ++i) {
    s << (i ? ", " : "") << remaining[i];
  }
  DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();

  // Group by P2P accessibility
  remaining_depth = ceil(log2(remaining.size()));
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      for (int j = i + 1; j < remaining.size(); ++j) {
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
        if (access) {
          pairs->push_back(DevicePair(remaining[i], remaining[j]));
          DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
          remaining.erase(remaining.begin() + j);
          break;
        }
      }
    }
  }
  s.str("");
  for (int i = 0; i < remaining.size(); ++i) {
    s << (i ? ", " : "") << remaining[i];
  }
  DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();

  // Group remaining
  remaining_depth = ceil(log2(remaining.size()));
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
      DLOG(INFO) << "Remaining pair: " << remaining[i] << ":" << remaining[i + 1];
      remaining.erase(remaining.begin() + i + 1);
    }
  }

  // Should only be the parent node remaining
  CHECK_EQ(remaining.size(), 1);
  pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));

  CHECK(pairs->size() == devices.size());
  for (int i = 0; i < pairs->size(); ++i) {
    CHECK((*pairs)[i].parent() != (*pairs)[i].device());
    for (int j = i + 1; j < pairs->size(); ++j) {
      CHECK((*pairs)[i].device() != (*pairs)[j].device());
    }
  }
#else
  NO_GPU;
#endif
}