Example #1
void Caffe::ConnectMasterSlaveDevice(const int master_device_id, const int slave_device_id){
	int can_access;
	int target_device_id = slave_device_id;

	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, master_device_id, slave_device_id));

	if (can_access == 0){
		LOG(WARNING)<<"Device P2P access from GPU "<<master_device_id<<" to GPU "<<slave_device_id<<" can not be enabled. Data transfering may be slow.";
	}else{
		CUDA_CHECK(cudaSetDevice(master_device_id));
		CUDA_CHECK(cudaDeviceEnablePeerAccess(target_device_id, 0));
		LOG(INFO)<<"Device P2P access from GPU "<<master_device_id<<" to GPU "<<slave_device_id<<"  enabled.";		
	}

	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, slave_device_id, master_device_id ));

	if (can_access == 0){
		LOG(WARNING)<<"Device P2P access from GPU "<<slave_device_id<<" to GPU "<<master_device_id<<" can not be enabled. Data transfering may be slow.";
	}else{
		CUDA_CHECK(cudaSetDevice(slave_device_id));
		CUDA_CHECK(cudaDeviceEnablePeerAccess(master_device_id, 0));
		LOG(INFO)<<"Device P2P access from GPU "<<slave_device_id<<" to GPU "<<master_device_id<<" enabled.";
	}
	CUDA_CHECK(cudaSetDevice(master_device_id));
}
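The examples on this page rely on small error-checking macros (CUDA_CHECK, THCudaCheck, CUDA_ERROR) whose definitions are not shown. A minimal sketch of such a macro, assuming the usual abort-on-error behaviour; each project defines its own variant (Caffe's, for instance, logs through glog):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal abort-on-error check; real projects typically also log the failing
// expression together with file and line.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err__ = (call);                                            \
    if (err__ != cudaSuccess) {                                            \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                   cudaGetErrorString(err__), __FILE__, __LINE__);         \
      std::exit(EXIT_FAILURE);                                             \
    }                                                                      \
  } while (0)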
Example #2
int main(int argc, char **argv)
{
	// define the ptr
	int size = WIDTH * HEIGHT;
	float *h_data, *d_send_data, *d_recv_data; 
	bool use_cuda_time = true;

	if(argc != 3) {
		std::cout << "the number of paramter should be equal 2" << std::endl;
		std::cout << "egs: bandwidth_test_between2gpu 0 1" << std::endl;
		return 1;
	}
	//std::cout << "debug 1" << std::endl;
	int id0 = atoi(argv[1]);
	int id1 = atoi(argv[2]);
	std::cout << "id0=" << id0 << ", id1=" << id1 << std::endl;

	//h_data = new float[size];
	cudaMallocHost(&h_data, size*sizeof(float));
	init_data(h_data, size);

	cudaSetDevice(id0);
	cudaMalloc(&d_send_data, size*sizeof(float));
	cudaSetDevice(id1);
	cudaMalloc(&d_recv_data, size*sizeof(float));
	// h_data is pinned host memory and d_send_data is a UVA pointer, so this
	// host-to-device copy is valid even though the current device is id1.
	cudaMemcpy(d_send_data, h_data, size*sizeof(float), cudaMemcpyHostToDevice);

	int can_access_peer_0_1, can_access_peer_1_0;
	cudaSetDevice(id0);
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_0_1, id0, id1));
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_1_0, id1, id0));

	if(can_access_peer_0_1 && can_access_peer_1_0) {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": Yes" << std::endl;
		cudaSetDevice(id0);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id1, 0));
		cudaSetDevice(id1);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id0, 0));
	} else {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": No" << std::endl;
	}

	cudaSetDevice(id1);
	use_cuda_time = false;
	//use_cuda_time = true;
	test_2gpu(d_send_data, d_recv_data, size, id0, id1, use_cuda_time);

	cudaFreeHost(h_data);
	cudaFree(d_send_data);
	cudaFree(d_recv_data);

	return 0;
}
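The example above references WIDTH, HEIGHT, init_data and test_2gpu without showing them. A hypothetical sketch of what they might look like, timing a single peer copy with CUDA events; the real helpers may differ (for example by also supporting CPU timing via the use_cuda_time flag):

#define WIDTH  4096
#define HEIGHT 4096

static void init_data(float *data, int size) {
	for (int i = 0; i < size; ++i) data[i] = static_cast<float>(i);
}

static void test_2gpu(float *d_send, float *d_recv, int size,
                      int id0, int id1, bool /*use_cuda_time*/) {
	size_t bytes = static_cast<size_t>(size) * sizeof(float);
	cudaEvent_t start, stop;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	CUDA_CHECK(cudaEventRecord(start, 0));
	// Copy the buffer that lives on GPU id0 into the buffer on GPU id1;
	// cudaMemcpyPeer uses P2P if it was enabled, otherwise it stages via host.
	CUDA_CHECK(cudaMemcpyPeer(d_recv, id1, d_send, id0, bytes));
	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));

	float ms = 0.0f;
	CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
	std::cout << "copied " << bytes / (1024.0 * 1024.0) << " MB in " << ms
	          << " ms (" << (bytes / (1024.0 * 1024.0 * 1024.0)) / (ms / 1000.0)
	          << " GB/s)" << std::endl;

	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));
}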
Example #3
TEST(PeerAccess, EnableAccessInvalidDevice) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaDeviceEnablePeerAccess(devices, 0);
    EXPECT_EQ(cudaErrorInvalidDevice, ret);

    ret = cudaDeviceEnablePeerAccess(-1, 0);
    EXPECT_EQ(cudaErrorInvalidDevice, ret);
}
Example #4
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
                                  int enable)
{
  /* This will perform device bounds checking for us */
  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);

  if (enable != prevEnabled) {
    /* If we're attempting to enable p2p access but p2p access isn't */
    /* supported, throw an error */
    if (enable) {
      int access = 0;
      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));

      if (!access) {
        THError("p2p access not supported for %d accessing %d",
                dev, devToAccess);
      }
    }

    state->p2pAccessEnabled[dev][devToAccess] = enable;

    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    /* This should be in sync with the current access state */
    if (enable) {
      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
    } else {
      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
}
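A hypothetical caller of the two THCState helpers (this one and THCState_getPeerToPeerAccess in Example #6), not part of the Torch sources, toggling the p2p link between devices 0 and 1:

/* Hypothetical usage sketch: flip the p2p setting between device 0 and
   device 1 through the THCState API shown above. */
static void toggle_p2p_0_1(THCState* state)
{
  int enabled = THCState_getPeerToPeerAccess(state, 0, 1);
  /* Request the opposite state; THCState_setPeerToPeerAccess performs the
     bounds checking and the actual cudaDeviceEnable/DisablePeerAccess calls. */
  THCState_setPeerToPeerAccess(state, 0, 1, !enabled);
}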
Example #5
void THCudaInit(THCudaState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCudaRNGState*)malloc(sizeof(THCudaRNGState));
  THCRandom_init(state->rngState, count, device);

  THCudaBlas_init(count, device);

  int i,j;
  for(i=0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    for (j=0; j < count; ++j)
    {
      if(i != j)
      {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if(can)
          THCudaCheck(cudaDeviceEnablePeerAccess(j, 0));
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
Example #6
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices) {
    THError("%d is not a device", dev);
  }
  if (devToAccess < 0 || devToAccess >= state->numDevices) {
    THError("%d is not a device", devToAccess);
  }
  if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    int access = 0;
    THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
    if (access) {
      cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // ignore and clear the error if access was already enabled
        cudaGetLastError();
      } else {
        THCudaCheck(err);
      }
      state->p2pAccessEnabled[dev][devToAccess] = 1;
    } else {
      state->p2pAccessEnabled[dev][devToAccess] = 0;
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}
Example #7
int main(int argc, char **argv)
{
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    
    printf("Inter-GPU bi-directional memory exhange test\n");
    
    int ndevs = 0;
    CUDA_CHECK(cudaGetDeviceCount(&ndevs));

    if (FLAGS_from >= ndevs)
    {
        printf("Invalid --from flag. Only %d GPUs are available.\n", ndevs);
        return 1;
    }
    if (FLAGS_to >= ndevs)
    {
        printf("Invalid --to flag. Only %d GPUs are available.\n", ndevs);
        return 2;
    }
    
    printf("Enabling peer-to-peer access\n");
    
    // Enable peer-to-peer access       
    for(int i = 0; i < ndevs; ++i)
    {
        CUDA_CHECK(cudaSetDevice(i));
        for(int j = 0; j < ndevs; ++j)
            if (i != j)
                cudaDeviceEnablePeerAccess(j, 0);
    } 

    printf("GPUs: %d\n", ndevs);
    printf("Data size: %.2f MB\n", (FLAGS_size / 1024.0f / 1024.0f));
    printf("Repetitions: %d\n", (int)FLAGS_repetitions);
    printf("\n");
    
    for(int i = 0; i < ndevs; ++i)
    {
        // Only copy from the requested source GPU (if one was given)
        if(FLAGS_from >= 0 && i != FLAGS_from)
            continue;
        
        for(int j = i; j < ndevs; ++j)
        {
            // Skip self-copies
            if(i == j)
                continue;
            // Only copy to the requested target GPU (if one was given)
            if(FLAGS_to >= 0 && j != FLAGS_to)
                continue;

            printf("Exchanging between GPU %d and GPU %d: ", i, j);

            CopySegment(i, j);
        }
    }

    return 0;
}
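CopySegment is not shown in the snippet above. A plausible sketch, under the assumption that it times bi-directional copies of FLAGS_size bytes, repeated FLAGS_repetitions times, using cudaMemcpyPeerAsync on one stream per GPU; the real implementation may differ:

#include <chrono>
#include <cstdio>

void CopySegment(int i, int j)
{
    char *bufi = nullptr, *bufj = nullptr;
    cudaStream_t si, sj;

    CUDA_CHECK(cudaSetDevice(i));
    CUDA_CHECK(cudaMalloc(&bufi, FLAGS_size));
    CUDA_CHECK(cudaStreamCreate(&si));
    CUDA_CHECK(cudaSetDevice(j));
    CUDA_CHECK(cudaMalloc(&bufj, FLAGS_size));
    CUDA_CHECK(cudaStreamCreate(&sj));

    auto t0 = std::chrono::steady_clock::now();
    for (int r = 0; r < FLAGS_repetitions; ++r)
    {
        // One copy in each direction per repetition; using two streams lets
        // the transfers overlap when the hardware supports it.
        CUDA_CHECK(cudaMemcpyPeerAsync(bufj, j, bufi, i, FLAGS_size, si));
        CUDA_CHECK(cudaMemcpyPeerAsync(bufi, i, bufj, j, FLAGS_size, sj));
    }
    CUDA_CHECK(cudaStreamSynchronize(si));
    CUDA_CHECK(cudaStreamSynchronize(sj));
    auto t1 = std::chrono::steady_clock::now();

    double seconds = std::chrono::duration<double>(t1 - t0).count();
    double gbytes  = 2.0 * FLAGS_repetitions * FLAGS_size
                     / (1024.0 * 1024.0 * 1024.0);
    printf("%.2f GB/s\n", gbytes / seconds);

    CUDA_CHECK(cudaSetDevice(i));
    CUDA_CHECK(cudaStreamDestroy(si));
    CUDA_CHECK(cudaFree(bufi));
    CUDA_CHECK(cudaSetDevice(j));
    CUDA_CHECK(cudaStreamDestroy(sj));
    CUDA_CHECK(cudaFree(bufj));
}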
Example #8
void cuda_p2p(int a, int b)
{
	int dev;
	CUDA_ERROR(cudaGetDevice(&dev));
	CUDA_ERROR(cudaSetDevice(a));
	CUDA_ERROR(cudaDeviceEnablePeerAccess(b, 0));
	CUDA_ERROR(cudaSetDevice(dev));
}
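cudaDeviceEnablePeerAccess is one-directional: it lets the current device map the peer's memory, not the other way round. A matching teardown helper, assumed rather than taken from the original source:

void cuda_p2p_disable(int a, int b)
{
	int dev;
	CUDA_ERROR(cudaGetDevice(&dev));
	CUDA_ERROR(cudaSetDevice(a));
	CUDA_ERROR(cudaDeviceDisablePeerAccess(b));
	CUDA_ERROR(cudaSetDevice(dev));
}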
Example #9
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  /* By default, all direct p2p kernel access (besides copy) is disallowed, */
  /* since direct access without knowing whether or not a certain operation */
  /* should be cross-GPU leads to synchronization errors. The user can choose */
  /* to disable this functionality, however. */
  state->p2pKernelAccessEnabled = 0;

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
  for (int i = 0; i < numDevices; ++i) {
    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
  }

  /* Build a table of all allowed p2p accesses, to avoid checking the p2p
     status at runtime. */
  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));

    for (int j = 0; j < numDevices; ++j) {
      /* Presume no access by default */
      state->p2pAccessEnabled[i][j] = 0;

      if (i == j) {
        /* A GPU can access itself */
        state->p2pAccessEnabled[i][j] = 1;
      } else {
        int access = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&access, i, j));

        if (access) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            /* Any future call to cudaGetLastError will now return an error, */
            /* even though we've already dealt with this specific error here. */
            /* Call cudaGetLastError once to reset the last error state. */
            cudaGetLastError();
            continue;
          }

          /* In case there are unknown errors returned from the above */
          THCudaCheck(err);

          /* Access could be enabled */
          state->p2pAccessEnabled[i][j] = 1;
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
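The access table above is heap-allocated per device. A sketch of the matching cleanup, assuming the caller frees the table at shutdown (the actual cutorch shutdown code may differ):

void THCudaFreePeerToPeerTable(THCState* state)
{
  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));
  for (int i = 0; i < numDevices; ++i) {
    free(state->p2pAccessEnabled[i]);
  }
  free(state->p2pAccessEnabled);
  state->p2pAccessEnabled = NULL;
}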
Example #10
TEST(PeerAccess, EnableDisable) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    if (devices <= 1) {
        return;
    }

    int version;
    ret = cudaRuntimeGetVersion(&version);
    ASSERT_EQ(cudaSuccess, ret);

    typedef std::pair<int, int> peer_t;
    std::vector<peer_t> peers;

    for (int i = 0; i < devices; i++) {
        ret = cudaSetDevice(i);
        ASSERT_EQ(cudaSuccess, ret);

        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            ASSERT_EQ(cudaSuccess, ret);

            cudaError_t expected;

            if (peer) {
                expected = cudaSuccess;
                peers.push_back(peer_t(i, j));
            #if CUDA_VERSION >= 5000
            } else if (version >= 5000 /* 5.0 */) {
                expected = cudaErrorPeerAccessUnsupported;
            #endif
            } else {
                expected = cudaErrorInvalidDevice;
            }

            ret = cudaDeviceEnablePeerAccess(j, 0);
            EXPECT_EQ(expected, ret);
        }
    }

    /* Cleanup. */
    const size_t n_peers = peers.size();

    for (size_t i = 0; i < n_peers; i++) {
        ret = cudaSetDevice(peers[i].first);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaDeviceDisablePeerAccess(peers[i].second);
        EXPECT_EQ(cudaSuccess, ret);
    }
}
Example #11
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
  for (int i = 0; i < numDevices; ++i) {
    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
  }

  /* Build a table of all allowed p2p accesses, to avoid checking the p2p
     status at runtime. */

  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));

    for (int j = 0; j < numDevices; ++j) {
      /* Presume no access by default */
      state->p2pAccessEnabled[i][j] = 0;

      if (i == j) {
        /* A GPU can access itself */
        state->p2pAccessEnabled[i][j] = 1;
      } else {
        int access = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&access, i, j));

        if (access) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            /* Any future call to cudaGetLastError will now return an error, */
            /* even though we've already dealt with this specific error here. */
            /* Call cudaGetLastError once to reset the last error state. */
            cudaGetLastError();
            continue;
          }

          /* In case there are unknown errors returned from the above */
          THCudaCheck(err);

          /* Access could be enabled */
          state->p2pAccessEnabled[i][j] = 1;
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
Example #12
P2PSync<Dtype>::P2PSync(shared_ptr<Solver<Dtype> > root_solver,
                        P2PSync<Dtype>* parent, const SolverParameter& param)
    : GPUParams<Dtype>(root_solver, param.device_id()),
      parent_(parent),
      children_(),
      queue_(),
      initial_iter_(root_solver->iter()),
      solver_() {
#ifndef CPU_ONLY
    int initial_device;
    CUDA_CHECK(cudaGetDevice(&initial_device));
    const int self = param.device_id();
    CUDA_CHECK(cudaSetDevice(self));

    if (parent == NULL) {
        solver_ = root_solver;
    } else {
        Caffe::set_root_solver(false);
        solver_.reset(new WorkerSolver<Dtype>(param, root_solver.get()));
        Caffe::set_root_solver(true);
    }
    this->configure(solver_.get());
    solver_->add_callback(this);

    if (parent) {
        // Enable p2p access between devices
        const int peer = parent->solver_->param().device_id();
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
        if (access) {
            CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
        } else {
            LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer;
        }
        // Allocate receiving buffer on parent
        CUDA_CHECK(cudaSetDevice(peer));
        CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype)));
        CUDA_CHECK(cudaSetDevice(self));
    }

    CUDA_CHECK(cudaSetDevice(initial_device));
#else
    NO_GPU;
#endif
}
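A sketch of the matching teardown, assuming the destructor mirrors the constructor by disabling peer access and releasing the parent-side buffer; the actual Caffe destructor may differ in detail:

template<typename Dtype>
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
    int initial_device;
    CUDA_CHECK(cudaGetDevice(&initial_device));
    const int self = solver_->param().device_id();
    CUDA_CHECK(cudaSetDevice(self));

    if (parent_) {
        // Release the receiving buffer that was allocated on the parent GPU;
        // with UVA, cudaFree accepts a pointer allocated on any device.
        CUDA_CHECK(cudaFree(parent_grads_));
        const int peer = parent_->solver_->param().device_id();
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
        if (access) {
            CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
        }
    }

    CUDA_CHECK(cudaSetDevice(initial_device));
#else
    NO_GPU;
#endif
}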
Example #13
TEST(PeerAccess, DeviceReset) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    if (devices <= 1) {
        return;
    }

    bool found = false;
    int dj;

    for (int i = 0; i < devices && !(found); i++) {
        ret = cudaSetDevice(i);
        ASSERT_EQ(cudaSuccess, ret);

        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            ASSERT_EQ(cudaSuccess, ret);

            if (peer) {
                ret = cudaDeviceEnablePeerAccess(j, 0);
                ASSERT_EQ(cudaSuccess, ret);

                found = true;
                dj = j;
                break;
            }
        }
    }

    if (!(found)) {
        return;
    }

    /* Perform a device reset. */
    ret = cudaDeviceReset();
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaDeviceDisablePeerAccess(dj);
    EXPECT_EQ(cudaErrorPeerAccessNotEnabled, ret);
}
Example #14
static int can_use_fastpath(lua_State *L, int sock, uint32_t bind_addr, uint32_t addr) {
#if defined(USE_CUDA) && !defined(__APPLE__)
   if (bind_addr == addr) {
      int device;
      THCudaCheck(cudaGetDevice(&device));
      int ret = send(sock, &device, sizeof(device), 0);
      if (ret < 0) {
         close(sock);
         return LUA_HANDLE_ERROR(L, errno);
      }
      int remote_device;
      ret = recv(sock, &remote_device, sizeof(remote_device), 0);
      if (ret <= 0) {
         close(sock);
         return LUA_HANDLE_ERROR(L, errno);
      }
      if (device != remote_device) {
         int can;
         THCudaCheck(cudaDeviceCanAccessPeer(&can, device, remote_device));
         if (can) {
            cudaError_t err = cudaDeviceEnablePeerAccess(remote_device, 0);
            if (err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled) {
               if (err == cudaErrorPeerAccessAlreadyEnabled) cudaGetLastError();
               fprintf(stderr, "INFO: torch-ipc: CUDA IPC enabled between GPU%d and GPU%d\n", device, remote_device);
               return 1;
            } else {
               fprintf(stderr, "WARN: torch-ipc: CUDA IPC disabled between GPU%d and GPU%d: %s\n", device, remote_device, cudaGetErrorString(err));
            }
         } else {
            fprintf(stderr, "INFO: torch-ipc: CUDA IPC not possible between GPU%d and GPU%d\n", device, remote_device);
         }
      }
   }
#else
   (void)L;
   (void)sock;
   (void)bind_addr;
   (void)addr;
#endif
   return 0;
}
Example #15
void THCudaInit(THCState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, count, device);

  state->blasState = (THCBlasState*)malloc(sizeof(THCBlasState));
  THCudaBlas_init(state, count, device);

  int i,j;
  for(i=0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    for (j=0; j < count; ++j)
    {
      if(i != j)
      {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if(can) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            // Any future call to cudaGetLastError will now return an error,
            // even though we've already dealt with this specific error here.
            // Call cudaGetLastError once to reset the last error state.
            cudaGetLastError();

            continue;
          }
          THCudaCheck(err);
        }
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
Example #16
void THCudaEnablePeerToPeerAccess(THCState* state)
{
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  int numDevices = -1;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  for (int i = 0; i < numDevices; ++i) {
    THCudaCheck(cudaSetDevice(i));

    for (int j = 0; j < numDevices; ++j) {
      if (i != j) {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));

        if (can) {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) {
            // Any future call to cudaGetLastError will now return an error,
            // even though we've already dealt with this specific error here.
            // Call cudaGetLastError once to reset the last error state.
            cudaGetLastError();

            continue;
          }

          THCudaCheck(err);
        }
      }
    }
  }

  /* Restore previous device before continuing */
  THCudaCheck(cudaSetDevice(prevDev));
}
Example #17
magma_int_t
magma_d_initP2P ( magma_int_t *bw_bmark, magma_int_t *num_gpus ){


    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
     (cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("Two or more Tesla(s) with (SM 2.0)"
                        " class GPUs are required for P2P.\n");
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid_tesla[64]; // find the first two GPU's that can support P2P
    int gpu_count = 0;   // GPUs that meet the criteria

    for (int i=0; i < gpu_n; i++) {
         (cudaGetDeviceProperties(&prop[i], i));
        // Add every detected device as a P2P candidate (the original NVIDIA
        // sample restricted this to Fermi-class Tesla boards).
        {
            // This is an array of P2P capable GPUs
            gpuid_tesla[gpu_count++] = i;
        }
    }
    *num_gpus=gpu_n;

     for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
  // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");
    int can_access_peer_0_1, can_access_peer_1_0;
    // In this case we just pick the first two that we can support
     (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], 
                                                        gpuid_tesla[j]));
     (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], 
                                                        gpuid_tesla[i]));


    // Output results from P2P capabilities
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],                                                                  
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j] ,
                             can_access_peer_0_1 ? "Yes" : "No");
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j],
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],
                            can_access_peer_1_0 ? "Yes" : "No");

    if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
    {
        printf("Two or more Tesla(s) with class"
                " GPUs are required for P2P to run.\n");
        printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n");
        printf("Peer to Peer access is not available between"
        " GPU%d <-> GPU%d, waiving test.\n", gpuid_tesla[i], gpuid_tesla[j]);
        printf("PASSED\n");
        //exit(EXIT_SUCCESS);
    }
     }    
    }

  // Enable peer access
     for(int i=0; i<gpu_n; i++)
     {
         for(int j=i+1; j<gpu_n; j++)
         {
             printf("Enabling peer access between GPU%d and GPU%d...\n",
                gpuid_tesla[i], gpuid_tesla[j]);
              (cudaSetDevice(gpuid_tesla[i]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0));
              (cudaSetDevice(gpuid_tesla[j]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0));
           magma_dcheckerr("P2P");
         }
     }

   magma_dcheckerr("P2P successful");


    // Enable peer access
    for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Check that we got UVA on both devices
    printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 
    gpuid_tesla[i], gpuid_tesla[j]);
    //const bool has_uva = (prop[gpuid_tesla[i]].unifiedAddressing && 
    //                            prop[gpuid_tesla[j]].unifiedAddressing);

    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[i]].name, 
    gpuid_tesla[i], (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No") );
    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[j]].name, 
    gpuid_tesla[j], (prop[gpuid_tesla[j]].unifiedAddressing ? "Yes" : "No") );

        }
    }



  if(*bw_bmark==1){


    // P2P memcopy() benchmark
   for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", 
                int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]);
    (cudaSetDevice(gpuid_tesla[i]));
    float* g0;
    (cudaMalloc(&g0, buf_size));
    (cudaSetDevice(gpuid_tesla[j]));
    float* g1;
    (cudaMalloc(&g1, buf_size));
    float* h0;
    (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    (cudaEventCreateWithFlags(&start_event, eventflags));
    (cudaEventCreateWithFlags(&stop_event, eventflags));


    (cudaEventRecord(start_event, 0));
    for (int k=0; k<100; k++)
    {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
            
        // Ping-pong copy between GPUs
        if (k % 2 == 0)
            (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        else
            (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    }
    (cudaEventRecord(stop_event, 0));
    (cudaEventSynchronize(stop_event));
    (cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between"
            "GPU%d and GPU%d: %.2fGB/s\n", gpuid_tesla[i], gpuid_tesla[j],
        (1.0f / (time_memcpy / 1000.0f)) 
            * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);


     // Cleanup and shutdown
    printf("Cleanup of P2P benchmark...\n");
    (cudaEventDestroy(start_event));
    (cudaEventDestroy(stop_event));
    (cudaSetDevice(gpuid_tesla[i]));
    (magma_free( g0) );
    (cudaSetDevice(gpuid_tesla[j]));
    (magma_free( g1) );
    (cudaFreeHost( h0 )); // h0 was allocated with cudaMallocHost, not malloc

    }
    }

    // host-device memcopy() benchmark

        for(int j=0; j<gpu_n; j++)
        {
    cudaSetDevice(gpuid_tesla[j]);

    int *h_data_source;
    int *h_data_sink;

    int *h_data_in[STREAM_COUNT];
    int *d_data_in[STREAM_COUNT];

    int *h_data_out[STREAM_COUNT];
    int *d_data_out[STREAM_COUNT];


    cudaEvent_t cycleDone[STREAM_COUNT];
    cudaStream_t stream[STREAM_COUNT];

    cudaEvent_t start, stop;

    // Allocate resources
    int memsize;
    memsize = 1000000 * sizeof(int);

    h_data_source = (int*) malloc(memsize);
    h_data_sink = (int*) malloc(memsize);    

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        ( cudaHostAlloc(&h_data_in[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_in[i], memsize) );
        
        ( cudaHostAlloc(&h_data_out[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_out[i], memsize) );

        
        ( cudaStreamCreate(&stream[i]) );
        ( cudaEventCreate(&cycleDone[i]) ); 
        
        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start); cudaEventCreate(&stop);

    

    // Time host-device copies
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, 
        cudaMemcpyHostToDevice,0) );
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_h2d_time;    
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, 
        cudaMemcpyDeviceToHost, 0) );        
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_d2h_time;    
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
    
    cudaEventSynchronize(stop);
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n", j,
        memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time );
    printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n", j,
        memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);

    // Free resources

    free( h_data_source );
    free( h_data_sink );

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        cudaFreeHost( h_data_in[i] );   // allocated with cudaHostAlloc
        magma_free( d_data_in[i] );

        cudaFreeHost( h_data_out[i] );  // allocated with cudaHostAlloc
        magma_free( d_data_out[i] );
        
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);        
    }
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    
   }

  } // end of optional bandwidth benchmark

    magma_dcheckerr("P2P established");

    return MAGMA_SUCCESS;

}
Example #18
magma_int_t magma_buildconnection_mgpu(  magma_int_t gnode[MagmaMaxGPUs+2][MagmaMaxGPUs+2], magma_int_t *nbcmplx, magma_int_t ngpu)
{
    magma_int_t *deviceid = (magma_int_t *) malloc(ngpu*sizeof(magma_int_t));
    memset(deviceid,0,ngpu*sizeof(magma_int_t));

    nbcmplx[0] =0;


    //printf(" Initializing....\n\n");
    //printf(" This machine has %d GPU\n",ngpu);

    //printf(" cudaSuccess %d, cudaErrorInvalidDevice %d, cudaErrorPeerAccessAlreadyEnabled %d, cudaErrorInvalidValue %d \n",    cudaSuccess, cudaErrorInvalidDevice,cudaErrorPeerAccessAlreadyEnabled, cudaErrorInvalidValue );

    int samecomplex=-1;
    cudaError_t err,scerr;
    cudaDeviceProp prop;

    magma_int_t cmplxnb = 0;
    magma_int_t cmplxid = 0;
    magma_int_t lcgpunb = 0;
    for( magma_int_t d = 0; d < ngpu; ++d ) {
        // check for unified memory & enable peer memory access between all GPUs.            
        magma_setdevice( d );
        cudaGetDeviceProperties( &prop, d );
        if ( ! prop.unifiedAddressing ) {
            printf( "device %d doesn't support unified addressing\n", (int) d );
            return -1;
        }
        // add this device to the list if not added yet.
        // not added yet meaning belong to a new complex
        if(deviceid[d]==0){
            cmplxnb           = cmplxnb+1;
            cmplxid           = cmplxnb-1;
            gnode[cmplxid][MagmaMaxGPUs] = 1;
            lcgpunb           = gnode[cmplxid][MagmaMaxGPUs]-1;
            gnode[cmplxid][lcgpunb] = d;
            deviceid[d]=-1;
        }
        //printf("DEVICE %d : \n",d);

        for( magma_int_t d2 = d+1; d2 < ngpu; ++d2 ) {
            // check for unified memory & enable peer memory access between all GPUs.            
            magma_setdevice( d2 );
            cudaGetDeviceProperties( &prop, d2 );
            if ( ! prop.unifiedAddressing ) {
                printf( "device %d doesn't support unified addressing\n", (int) d2 );
                return -1;
            }

            scerr = cudaDeviceCanAccessPeer(&samecomplex, d, d2);
            if ( scerr != cudaSuccess ) samecomplex = 0;

            //printf(" device %d and device %d have samecomplex= %d\n",d,d2,samecomplex);
            if(samecomplex==1){
                // d and d2 are on the same complex so add them, note that d is already added
                // so just enable the peer Access for d and enable+add d2.
                // FOR d:
                magma_setdevice( d );
                err   = cudaDeviceEnablePeerAccess( d2, 0 );
                //printf("enabling devide %d ==> %d  error %d\n",d,d2,err);
                if ( err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled ) {
                    printf( "device %d cudaDeviceEnablePeerAccess error %d\n", (int) d2, (int) err );
                    return -2;
                }

                // FOR d2:
                magma_setdevice( d2 );
                err   = cudaDeviceEnablePeerAccess( d, 0 );
                //printf("enabling devide %d ==> %d  error %d\n",d2,d,err);
                if((err==cudaSuccess)||(err==cudaErrorPeerAccessAlreadyEnabled)){
                    if(deviceid[d2]==0){
                        //printf("adding device %d\n",d2);
                        gnode[cmplxid][MagmaMaxGPUs] = gnode[cmplxid][MagmaMaxGPUs]+1;
                        lcgpunb           = gnode[cmplxid][MagmaMaxGPUs]-1;
                        gnode[cmplxid][lcgpunb] = d2;
                        deviceid[d2]=-1;
                    }
                }else{
                    printf( "device %d cudaDeviceEnablePeerAccess error %d\n", (int) d, (int) err );
                    return -2;
                }
            }
        }
    }

    free(deviceid);
    nbcmplx[0] = cmplxnb;
    return cmplxnb;
}
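A hypothetical caller (not part of MAGMA) that builds the connection map and prints which GPUs ended up in each "complex"; gnode[c][MagmaMaxGPUs] holds the number of GPUs in complex c and gnode[c][0..n-1] their device ids:

void print_gpu_complexes(magma_int_t ngpu)
{
    magma_int_t gnode[MagmaMaxGPUs+2][MagmaMaxGPUs+2];
    magma_int_t nbcmplx = 0;

    if (magma_buildconnection_mgpu(gnode, &nbcmplx, ngpu) < 0) {
        printf("building the GPU connection map failed\n");
        return;
    }
    for (magma_int_t c = 0; c < nbcmplx; ++c) {
        magma_int_t n = gnode[c][MagmaMaxGPUs];
        printf("complex %d:", (int) c);
        for (magma_int_t k = 0; k < n; ++k)
            printf(" GPU %d", (int) gnode[c][k]);
        printf("\n");
    }
}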
Example #19
int main(int argc, char **argv)
{
    char inputFile[] = "../test/data/example_input";
    char patternFile[] = "../test/pattern/example_pattern";
    PFAC_handle_t handle;
    PFAC_status_t PFAC_status;
    int input_size;
    char *h_input_string = NULL;
    int  *h_matched_result = NULL;

    char *d_input_string;
    int *d_matched_result;
    cudaError_t cuda_status; 

    // step 1: create PFAC handle and bind PFAC context to GPU 0 
    cuda_status = cudaSetDevice(0);
    assert( cudaSuccess == cuda_status );

    PFAC_status = PFAC_create( &handle ); 
    assert( PFAC_STATUS_SUCCESS == PFAC_status );

    PFAC_status = PFAC_readPatternFromFile(handle, patternFile);
    assert ( PFAC_STATUS_SUCCESS == PFAC_status );
  
    // prepare input stream h_input_string in host memory and
    // allocate h_matched_result to contain the matched results
    FILE* fpin = fopen( inputFile, "rb");
    assert ( NULL != fpin ) ;
    fseek (fpin , 0 , SEEK_END); 
    input_size = ftell (fpin); // obtain file size
    rewind (fpin);
    h_input_string = (char *) malloc (sizeof(char)*input_size);
    assert( NULL != h_input_string );

    h_matched_result = (int *) malloc (sizeof(int)*input_size);
    assert( NULL != h_matched_result );
    memset( h_matched_result, 0, sizeof(int)*input_size ) ;
     
    // copy the file into the buffer
    input_size = fread(h_input_string, 1, input_size, fpin);
    fclose(fpin); 

    // step 2: set device to GPU 1, allocate d_input_string and d_matched_result on GPU 1
    cuda_status = cudaSetDevice(1) ;
    assert( cudaSuccess == cuda_status ) ; 

    cuda_status = cudaMalloc((void **) &d_input_string, input_size);
    if ( cudaSuccess != cuda_status ){
        printf("Error: %s\n", cudaGetErrorString(cuda_status));
        exit(1) ;
    }
    printf("d_input_string = %p, its UVA info:\n", d_input_string );
    show_memoryType( (void *)d_input_string );
    
    // allocate GPU memory for matched result
    cuda_status = cudaMalloc((void **) &d_matched_result, sizeof(int)*input_size);
    if ( cudaSuccess != cuda_status ){
        printf("Error: %s\n", cudaGetErrorString(cuda_status));
        exit(1) ;
    }
    printf("d_matched_result = %p, its UVA info:\n", d_matched_result );
    show_memoryType( (void *)d_matched_result );
    
    // copy input string from host to device
    cudaMemcpy(d_input_string, h_input_string, input_size, cudaMemcpyHostToDevice);

   /*
    * step 3: set device to GPU 0 again, enable peer-to-peer access with device 1
    *    PFAC binds to GPU 0 and wants to access d_input_string and d_matched_result in GPU 1
    * WARNING: if cudaDeviceEnablePeerAccess() is disable, then error "unspecified launch failure" occurs
    */
    cuda_status = cudaSetDevice(0) ;
    assert( cudaSuccess == cuda_status ); 
    cuda_status = cudaDeviceEnablePeerAccess(1, 0); // enable peer to peer access 
    assert( cudaSuccess == cuda_status ); 
          
    // step 4: run PFAC on GPU by calling PFAC_matchFromDevice          
    PFAC_status = PFAC_matchFromDevice( handle, d_input_string, input_size, d_matched_result ) ;
    if ( PFAC_STATUS_SUCCESS != PFAC_status ){
        printf("Error: PFAC_matchFromDevice failed, %s\n", PFAC_getErrorString(PFAC_status) );
        exit(1) ;	
    }     
    
    cudaDeviceSynchronize();  // cudaThreadSynchronize() is deprecated
    cuda_status = cudaGetLastError();
    if ( cudaSuccess != cuda_status ){
        printf("Error: PFAC_matchFromDevice failed, %s\n", cudaGetErrorString(cuda_status));
        exit(1) ;        	
    }
    
    // copy the result data from device to host
    cudaMemcpy(h_matched_result, d_matched_result, sizeof(int)*input_size, cudaMemcpyDeviceToHost);

    // step 5: output matched result
    for (int i = 0; i < input_size; i++) {
        if (h_matched_result[i] != 0) {
            printf("At position %4d, match pattern %d\n", i, h_matched_result[i]);
        }
    }

    PFAC_destroy( handle ) ;

    free(h_input_string);
    free(h_matched_result); 
    cudaFree(d_input_string);
    cudaFree(d_matched_result);
    
    cudaDeviceReset();  // cudaThreadExit() is deprecated
    
    return 0;
}
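show_memoryType is not shown above. A hypothetical sketch that prints the UVA information for a pointer via cudaPointerGetAttributes; only the fields that are stable across CUDA versions are printed:

void show_memoryType( void *ptr )
{
    cudaPointerAttributes attr;
    cudaError_t status = cudaPointerGetAttributes( &attr, ptr );
    if ( cudaSuccess != status ){
        printf("    cudaPointerGetAttributes failed: %s\n",
               cudaGetErrorString(status));
        cudaGetLastError(); // clear the sticky error state
        return;
    }
    printf("    owning device : %d\n", attr.device);
    printf("    devicePointer : %p\n", attr.devicePointer);
    printf("    hostPointer   : %p\n", attr.hostPointer);
}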