Example #1
void
SentinelWalker<DistanceEvaluator, ResultsCalculator>::walk
(size_t gridSize, size_t blockSize, size_t minWalks,
 DistanceEvaluator& distance, ResultsCalculator& results)
{
  size_t threads = gridSize * blockSize;
  size_t walksPerThread = minWalks / threads + 1;
  
  srand(time(NULL));

  ResultsCalculator* d_results;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_results, sizeof(ResultsCalculator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_results, &results, sizeof(ResultsCalculator),
			 cudaMemcpyHostToDevice));
  
  DistanceEvaluator* d_distance;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_distance, sizeof(DistanceEvaluator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_distance, &distance, sizeof(DistanceEvaluator),
			 cudaMemcpyHostToDevice));

  cuda::check(__FILE__, __LINE__, cudaDeviceSynchronize());

  cudaStream_t workStream;
  cuda::check(cudaStreamCreateWithFlags(&workStream, cudaStreamNonBlocking));

  cudaStream_t sentinelStream;
  cuda::check(cudaStreamCreateWithFlags(&sentinelStream, cudaStreamNonBlocking));
  
  double start = seconds();
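  // Launch the walker kernel on workStream; the loop below polls the running
  // total via the sentinel stream and signals the kernel to stop once at
  // least minWalks walks have completed.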
  
  gpuWalkWithSentinel<DistanceEvaluator><<<gridSize, blockSize, 0, workStream>>>
    (rand(), d_distance, d_results);

  size_t runningTotal;
  do {
    runningTotal = results.getRunningTotal(sentinelStream);
    //std::cout << runningTotal << std::endl;
  } while (runningTotal < minWalks);

  std::cout << "Stopping." << std::endl;
  
  results.endRun(sentinelStream);

  std::cout << "Stop signal sent." << std::endl;
  
  cuda::check(__FILE__, __LINE__, cudaStreamSynchronize(workStream));
  
  double timeSpent      = seconds() - start;
  double totalWalks     = walksPerThread * threads;
  double timePerWalk    = timeSpent / totalWalks;
  double walksPerSecond = totalWalks / timeSpent;
  
  std::cout << "Total GPU Time:   " << timeSpent      << "s"<< std::endl;
  std::cout << "Time Per Walk:    " << timePerWalk    << "s" << std::endl;
  std::cout << "Walks Per Second: " << walksPerSecond << "s" << std::endl;
}
Example #2
void CopySegment(int a, int b)
{
    void *deva_buff = nullptr, *devb_buff = nullptr;
    void *deva_buff2 = nullptr, *devb_buff2 = nullptr;

    cudaStream_t a_stream, b_stream;

    // Allocate buffers
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaMalloc(&deva_buff, FLAGS_size));
    CUDA_CHECK(cudaMalloc(&deva_buff2, FLAGS_size));
    CUDA_CHECK(cudaStreamCreateWithFlags(&a_stream, cudaStreamNonBlocking));
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaMalloc(&devb_buff, FLAGS_size));
    CUDA_CHECK(cudaMalloc(&devb_buff2, FLAGS_size));
    CUDA_CHECK(cudaStreamCreateWithFlags(&b_stream, cudaStreamNonBlocking));

    // Synchronize devices before copying
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaDeviceSynchronize());

    
    
    // Exchange
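    // Each iteration enqueues one a->b copy on b's stream and one b->a copy
    // on a's stream; both devices are synchronized below before the timer stops.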
    auto t1 = std::chrono::high_resolution_clock::now();
    for(uint64_t i = 0; i < FLAGS_repetitions; ++i)
    {
        CUDA_CHECK(cudaMemcpyPeerAsync(devb_buff, b, deva_buff, a,
                                       FLAGS_size, b_stream));
        CUDA_CHECK(cudaMemcpyPeerAsync(deva_buff2, a, devb_buff2, b,
                                       FLAGS_size, a_stream));
    }
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaDeviceSynchronize());
    auto t2 = std::chrono::high_resolution_clock::now();

    double mstime = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000.0 / FLAGS_repetitions;

    // MiB/s = [bytes / (1024^2)] / [ms / 1000]
    double MiBps = (FLAGS_size / 1024.0 / 1024.0) / (mstime / 1000.0);
    
    printf("%.2lf MiB/s (%lf ms)\n", MiBps, mstime);
    
    // Free buffers
    CUDA_CHECK(cudaSetDevice(a));
    CUDA_CHECK(cudaFree(deva_buff));
    CUDA_CHECK(cudaFree(deva_buff2));
    CUDA_CHECK(cudaStreamDestroy(a_stream));
    CUDA_CHECK(cudaSetDevice(b));
    CUDA_CHECK(cudaFree(devb_buff));
    CUDA_CHECK(cudaFree(devb_buff2));
    CUDA_CHECK(cudaStreamDestroy(b_stream));
}
Example #3
void
HistogramWalker<DistanceEvaluator, ResultsCalculator>::walk
(size_t gridSize, size_t blockSize, size_t minWalks,
 DistanceEvaluator& distance, ResultsCalculator& results)
{
#ifdef PORTION
  std::cout << "Portion Walk Sizes:" << std::endl;
#endif
#ifdef EARLYSENTINEL
  std::cout << "Early Sentinel Walk Sizes:" << std::endl;
#endif
#ifdef WAITSENTINEL
  std::cout << "Wait Sentinel Walk Sizes:" << std::endl;
#endif

  size_t threads = gridSize * blockSize;
  size_t walksPerThread = minWalks / threads + 1;
  
  srand(time(NULL));

  ResultsCalculator* d_results;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_results, sizeof(ResultsCalculator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_results, &results, sizeof(ResultsCalculator),
			 cudaMemcpyHostToDevice));
  
  DistanceEvaluator* d_distance;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_distance, sizeof(DistanceEvaluator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_distance, &distance, sizeof(DistanceEvaluator),
			 cudaMemcpyHostToDevice));

  cuda::check(__FILE__, __LINE__, cudaDeviceSynchronize());

  cudaStream_t workStream;
  cuda::check(cudaStreamCreateWithFlags(&workStream, cudaStreamNonBlocking));

  cudaStream_t sentinelStream;
  cuda::check(cudaStreamCreateWithFlags(&sentinelStream, cudaStreamNonBlocking));
  
  double start = seconds();
  
  gpuWalkWithSentinel<DistanceEvaluator><<<gridSize, blockSize, 0, workStream>>>
    (walksPerThread, rand(), d_distance, d_results);

  size_t runningTotal;
  do {
    runningTotal = results.getRunningTotal(sentinelStream);
  } while (runningTotal < minWalks);
  
  results.endRun(sentinelStream);

  cuda::check(__FILE__, __LINE__, cudaStreamSynchronize(workStream));
}
Example #4
GPUDataTransferer::GPUDataTransferer(int deviceId, bool useConcurrentStreams) 
{
#pragma warning(disable : 4127)
    if (useConcurrentStreams && (s_fetchStream == NULL))
    {
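        // The "cuda call || message" pattern relies on a project-defined
        // operator|| overload for cudaError_t that throws with the message
        // when the call fails.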
        cudaStreamCreateWithFlags(&s_fetchStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
        cudaStreamCreateWithFlags(&s_assignStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
    }

    m_inner = make_unique<GranularGPUDataTransferer>(deviceId, s_fetchStream, s_assignStream);
}
Example #5
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  cudaStream_t stream2;
  if (Caffe::mode() == Caffe::GPU) {
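    // One stream for the transformed batch; a second stream is created only
    // when an untransformed top blob is produced as well.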
    CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      Batch<Dtype>* batch = prefetch_free_.pop();
      Batch<Dtype>* batch_untransformed = NULL;
      if (untransformed_top_)
        {
          batch_untransformed = prefetch_free_untransformed_.pop();
          load_batch_and_untransformed_batch(batch,batch_untransformed);
        }
      else
        load_batch(batch);

#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream));
        if (untransformed_top_)
          {
            batch_untransformed->data_.data().get()->async_gpu_push(stream2);
            CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream2));
          }
      }
#endif
      prefetch_full_.push(batch);
      if (untransformed_top_)
        prefetch_full_untransformed_.push(batch_untransformed);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream2));
  }
#endif
}
Example #6
    cudaStream_t target::native_handle_type::get_stream() const
    {
        std::lock_guard<mutex_type> l(mtx_);
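        // stream_ is created lazily on first use; the lock above guards
        // concurrent initialisation of the handle.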

        if (stream_ == 0)
        {
            cudaError_t error = cudaSetDevice(device_);
            if (error != cudaSuccess)
            {
                HPX_THROW_EXCEPTION(kernel_error,
                    "cuda::target::native_handle::get_stream()",
                    std::string("cudaSetDevice failed: ") +
                        cudaGetErrorString(error));
            }
            error = cudaStreamCreateWithFlags(&stream_,
                cudaStreamNonBlocking);
            if (error != cudaSuccess)
            {
                HPX_THROW_EXCEPTION(kernel_error,
                    "cuda::target::native_handle::get_stream()",
                    std::string("cudaStreamCreate failed: ") +
                        cudaGetErrorString(error));
            }
        }
        return stream_;
    }
Example #7
ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt,
                                        uint64_t remote_addr, uct_rkey_t rkey,
                                        uct_completion_t *comp)
{
    uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t);
    ucs_status_t status;
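    /* the device-to-host stream is created lazily the first time a GET zcopy is posted */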

    if (iface->stream_d2h == 0) {
        status = UCT_CUDA_FUNC(cudaStreamCreateWithFlags(&iface->stream_d2h,
                               cudaStreamNonBlocking));
        if (UCS_OK != status) {
            return UCS_ERR_IO_ERROR;
        }
    }

    status = uct_cuda_copy_post_cuda_async_copy(tl_ep, iov[0].buffer, (void *)remote_addr,
                                                iov[0].length, cudaMemcpyDeviceToHost,
                                                iface->stream_d2h,
                                                &iface->outstanding_d2h_cuda_event_q, comp);

    UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY,
                      uct_iov_total_length(iov, iovcnt));
    uct_cuda_copy_trace_data(remote_addr, rkey, "GET_ZCOPY [length %zu]",
                             uct_iov_total_length(iov, iovcnt));
    return status;
}
Example #8
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;  // create a non-blocking CUDA stream
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {  // keep loading batches until asked to stop
      Batch<Dtype>* batch = prefetch_free_.pop();  // take a free batch
      load_batch(batch);  // fill it with data
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        if (this->output_labels_) {
          batch->label_.data().get()->async_gpu_push(stream);
        }
        CUDA_CHECK(cudaStreamSynchronize(stream));  // wait for the GPU transfers to finish
      }
#endif
      prefetch_full_.push(batch);  // push the loaded batch onto the full queue
    }
  } catch (boost::thread_interrupted&) {  // the interruption exception ends the while loop
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));  // destroy the CUDA stream
  }
#endif
}
Example #9
void BasePrefetchingLabelmapDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      LabelmapBatch<Dtype>* batch = prefetch_free_.pop();
      load_batch(batch);
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
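        // Stage the loaded labelmap batch on the GPU via the non-blocking
        // stream and wait for the copy before publishing the batch.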
        batch->data_.data().get()->async_gpu_push(stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }
#endif
      prefetch_full_.push(batch);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));
  }
#endif
}
Example #10
GPUDataTransferer<ElemType>::GPUDataTransferer(int deviceId, bool useConcurrentStreams)
    : m_deviceId(deviceId)
{
    PrepareDevice(m_deviceId);

    // events
    // Note: Do NOT use cudaEventBlockingSync (which supposedly yields the process)--it will totally break cudaEventSynchronize(), causing it to take 50 or 100 ms randomly.
    cudaEventCreateWithFlags(&m_fetchCompleteEvent, cudaEventDisableTiming) || "cudaEventCreateWithFlags failed";
    cudaEventCreateWithFlags(&m_assignCompleteEvent, cudaEventDisableTiming) || "cudaEventCreateWithFlags failed";

#pragma warning(disable : 4127)
    if (useConcurrentStreams && (m_fetchStream == NULL))
    {
        cudaStreamCreateWithFlags(&m_fetchStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
        cudaStreamCreateWithFlags(&m_assignStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
    }
}
Example #11
THCStream* THCStream_new(int flags)
{
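  // Allocate a refcounted wrapper, record the current device, and create the
  // stream with the caller-supplied flags (e.g. cudaStreamNonBlocking).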
  THCStream* self = (THCStream*) malloc(sizeof(THCStream));
  self->refcount = 1;
  THCudaCheck(cudaGetDevice(&self->device));
  THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags));
  return self;
}
Example #12
PrefetchGPUDataTransferer::PrefetchGPUDataTransferer(int deviceId) : GranularGPUDataTransferer(deviceId, s_gpuToCpuStream, s_prefetchStream, true)
{
#pragma warning(disable : 4127)
    if (s_prefetchStream == nullptr)
    {
        // Assign stream always stays null, not required for prefetch.

        // TODO: Currently the s_prefetchStream is not destroyed.
        // As static it can be used in several readers with different lifecycle so we allow it to live till the end.
        cudaStreamCreateWithFlags(&s_prefetchStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
    }
}
Example #13
void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
{
  if (numStreams <= state->numUserStreams)
  {
    return;
  }

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Otherwise, we have to allocate a new set of streams and stream data */
  for (int dev = 0; dev < state->numDevices; ++dev) {
    THCudaCheck(cudaSetDevice(dev));

    /* +1 for the default stream as well */
    cudaStream_t* newStreams =
      (cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t));

    void** newScratchSpace =
      (void**) malloc((numStreams + 1) * sizeof(void*));

    /* Copy over old stream data
       (0 is default stream, 1 ... numUserStreams are rest) */
    for (int stream = 0; stream <= state->numUserStreams; ++stream) {
      newStreams[stream] =
        THCState_getDeviceStream(state, dev, stream);
      newScratchSpace[stream] =
        THCState_getDeviceScratchSpace(state, dev, stream);
    }

    /* Allocate new stream resources */
    size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
    unsigned int flags =
      nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;

    for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
      newStreams[stream] = NULL;
      THCudaCheck(cudaStreamCreateWithFlags(newStreams + stream, flags));
      newScratchSpace[stream] = NULL;
      THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
    }

    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
    free(res->streams);
    res->streams = newStreams;
    free(res->devScratchSpacePerStream);
    res->devScratchSpacePerStream = newScratchSpace;
  }

  state->numUserStreams = numStreams;

  THCudaCheck(cudaSetDevice(prevDev));
}
Example #14
int main(void)
{
    int *a = (int*)malloc(sizeof(int)*N);
    if(a == NULL){
	return 1;
    }
    cudaStream_t st;
    cudaError_t error = cudaStreamCreateWithFlags(&st, cudaStreamNonBlocking);
    if(error != cudaSuccess){
	return 1;
    }

    acc_set_cuda_stream(2, st);
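    // OpenACC async queue 2 is now bound to the CUDA stream st, so the explicit
    // cudaMemcpyAsync below and the async(2) loop execute in order on that stream.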

    for(int i = 0; i < N; i++){
	a[i] = i;
    }

#pragma acc data copyout(a[0:N])
    {
	int *dev_a;

#pragma acc host_data use_device(a)
	dev_a = a;

	cudaMemcpyAsync(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice, st);

#pragma acc parallel loop async(2)
	for(int i = 0; i < N; i++){
	    a[i] += i;
	}

#pragma acc wait(2)
    }

    for(int i = 0; i < N; i++){
	if(a[i] != i*2) return 1;
    }

    printf("PASS\n");
    return 0;
}
Example #15
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
#ifdef USE_CUDA
  cudaStream_t stream;
  if (Caffe::mode() == Caffe::GPU) {
    if (this->get_device()->backend() == BACKEND_CUDA) {
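      // The stream is only created when this layer's device uses the CUDA
      // backend; other backends skip the CUDA-specific path entirely.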
      CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    }
  }
#endif  // USE_CUDA
#endif  // !CPU_ONLY

  try {
    while (!must_stop()) {
      Batch<Dtype>* batch = prefetch_free_.pop();
      load_batch(batch);
#ifndef CPU_ONLY
#ifdef USE_CUDA
      if (Caffe::mode() == Caffe::GPU) {
        if (this->get_device()->backend() == BACKEND_CUDA) {
          batch->data_.data().get()->async_gpu_push(stream);
          CUDA_CHECK(cudaStreamSynchronize(stream));
        }
      }
#endif  // USE_CUDA
#endif  // !CPU_ONLY
      prefetch_full_.push(batch);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    if (this->get_device()->backend() == BACKEND_CUDA) {
      CUDA_CHECK(cudaStreamDestroy(stream));
    }
  }
#endif  // USE_CUDA
#endif  // !CPU_ONLY
}
Example #16
PrefetchGPUDataTransferer::PrefetchGPUDataTransferer(int deviceId) : GranularGPUDataTransferer(deviceId, nullptr, nullptr, true)
{
    cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed (PrefetchGPUDataTransferer ctor)";
}
Example #17
    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        if (new_size == 0)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                CHECK_CUDA(cudaStreamSynchronize(0));
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                CHECK_CUDA(cudaStreamSynchronize(0));
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;

            try
            {
                CHECK_CUDA(cudaGetDevice(&the_device_id));

                // free memory blocks before we allocate new ones.
                data_host.reset();
                data_device.reset();

                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // Note that we don't throw exceptions since the free calls are invariably
                // called in destructors.  They also shouldn't fail anyway unless someone
                // is resetting the GPU card in the middle of their program.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if(err!=cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }

            }
            catch(...)
            {
                set_size(0);
                throw;
            }
        }
    }
Example #18
void NCCL<Dtype>::Init() {
  if (solver_->param().layer_wise_reduce()) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
  }
}
Example #19
gaspi_return_t
gaspi_init_GPUs()
{
  int i, j, k;
  int deviceCount;
  int device_id = 0;
  int gaspi_devices = 0;
  int ib_numa_node;
  int direct_devices[32];
  struct cudaDeviceProp deviceProp;

  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
  if ( error_id != cudaSuccess )
  {
    gaspi_print_error("Failed cudaGetDeviceCount." );
    return GASPI_ERROR;
  }

  if( deviceCount <= 0 )
  {
    gaspi_print_error("No CUDA capable devices found.");
    return GASPI_ERROR;
  }

  ib_numa_node = _gaspi_find_GPU_ib_numa_node();

  for(device_id = 0; device_id < deviceCount; device_id++)
  {
    cudaGetDeviceProperties(&deviceProp, device_id);
    if( deviceProp.major >= 3 ) /* TODO: magic number */
    {
      cudaSetDevice(device_id);
      if( ib_numa_node == _gaspi_find_GPU_numa_node(device_id) )
      {
	direct_devices[gaspi_devices] = device_id;
	gaspi_devices++;
      }
    }
  }

  if( 0 == gaspi_devices )
  {
    gaspi_print_error("No GPU Direct RDMA capable devices on the correct NUMA-socket were found.");
    return GASPI_ERROR;
  }

  glb_gaspi_ctx.gpu_count = gaspi_devices;

  gpus = (gaspi_gpu *) malloc(sizeof(gaspi_gpu)*glb_gaspi_ctx.gpu_count);
  if( !gpus )
    {
      gaspi_print_error("Failed to allocate mameory.");
      return GASPI_ERR_MEMALLOC;
    }

  for(k = 0 ; k < gaspi_devices; k++)
  {
    cudaSetDevice(direct_devices[k]);

    for( i = 0; i < GASPI_MAX_QP; i++)
    {
      for(j = 0; j < GASPI_CUDA_EVENTS; j++)
      {
	cudaEventCreateWithFlags(&gpus[k].events[i][j].event, cudaEventDisableTiming);
      }

      cudaStreamCreateWithFlags(&gpus[k].streams[i], cudaStreamNonBlocking);
    }

    gpus[k].device_id = direct_devices[k];
  }

  glb_gaspi_ctx.use_gpus = 1;

  return GASPI_SUCCESS;
}
Example #20
/*
 * Function to be called
 */
void* device_thread(void* passing_ptr) {
    DataArray* data_arr_ptr = (DataArray*) passing_ptr; // casting passed pointer


    cuDoubleComplex* data_r_dev;
    cuDoubleComplex* data_k_dev;


    // init device, allocate suitable variables in gpu memory ...
    //alloc_data_device(data_arr_ptr);
    cudaMalloc((void**) &data_r_dev, sizeof(cuDoubleComplex)*N); // device memory (use cudaMallocHost for pinned host buffers)
    cudaMalloc((void**) &data_k_dev, sizeof(cuDoubleComplex)*N); // device memory
    data_arr_ptr->data_r_dev = &data_r_dev; // store pointers to the device arrays so the host side can reach them
    data_arr_ptr->data_k_dev = &data_k_dev;
    printf("data allocated by host thread\n");

    // Does each thread create a new stream automatically???
    // http://devblogs.nvidia.com/parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
    cudaStreamCreateWithFlags(streams_arr, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(streams_arr+1, cudaStreamNonBlocking);
    printf("streams created\n");

    // synchronize after allocating memory - data on host should be allocated and ready for copying
    cudaDeviceSynchronize(); // CHECK IF THIS DOES NOT CAUSE ERRORS! - should synchronize host and device irrespective of pthreads
    // cudaStreamSynchronize( <enum stream> ); // to synchronize only with stream !!!
    pthread_barrier_wait (&barrier);
    printf("1st barier device thread - allocating mem on gpu\n");




    //copying data
    cudaMemcpyAsync( *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_r), N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, streams_arr[MEMORY_STREAM] );

    // synchronize after copying data
    cudaDeviceSynchronize(); // should be used on
    pthread_barrier_wait (&barrier);
    printf("2nd barier device thread - copying data on gpu\n");






    printf("data visible in device thread:\n");

    /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) {
      printf("%lu.\t",ii);
      printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] ));
      printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] ));
    }*/

    // synchronize after copying
    pthread_barrier_wait (&barrier);
    printf("3rd barier device thread - \n");



    //copying data
    //cudaMemcpyAsync( *(data_arr_ptr->data_r), *(data_arr_ptr->data_r_dev), N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );
    cudaMemcpyAsync( *(data_arr_ptr->data_r), data_r_dev, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );


    // synchronize after copying back data
    cudaDeviceSynchronize(); // should be used on
    pthread_barrier_wait (&barrier);
    printf("4th barier device thread - \n");


    cudaStreamDestroy(streams_arr[KERNEL_STREAM]);
    cudaStreamDestroy(streams_arr[MEMORY_STREAM]);

    cudaFree(data_r_dev);
    printf("device r space freed\n");
    cudaFree(data_k_dev);
    cudaDeviceSynchronize();
    printf("device k space freed\n");

    printf("closing device thread\n");
    pthread_exit(NULL);
}
Example #21
gaspi_return_t
gaspi_gpu_init(void)
{
  gaspi_context_t * const gctx = &glb_gaspi_ctx;
  int deviceCount;
  cudaError_t cuda_error_id = cudaGetDeviceCount(&deviceCount);
  if( cuda_error_id != cudaSuccess )
    {
      gaspi_print_error("Failed cudaGetDeviceCount." );
      return GASPI_ERR_DEVICE;
    }

  if( deviceCount <= 0 )
    {
      gaspi_print_error("No CUDA capable devices found.");
      return GASPI_ERR_DEVICE;
    }

  const int ib_numa_node = _gaspi_find_dev_numa_node();

  int device_id = 0;
  int gaspi_devices = 0;
  int direct_devices[GPI2_GPU_MAX_DIRECT_DEVS];
  struct cudaDeviceProp deviceProp;
  for(device_id = 0; device_id < deviceCount; device_id++)
    {
      //TODO: possibly add functionality to show properties structure
      cuda_error_id = cudaGetDeviceProperties(&deviceProp, device_id);
      if( cuda_error_id != cudaSuccess)
	{
	  return GASPI_ERR_DEVICE;
	}

      if( deviceProp.major >= 3 ) /* TODO: magic number */
	{
	  cuda_error_id = cudaSetDevice(device_id);
	  if( cuda_error_id != cudaSuccess )
	    {
	      return GASPI_ERR_DEVICE;
	    }

	  if( ib_numa_node == _gaspi_find_GPU_numa_node(device_id) )
	    {
	      if( gaspi_devices < GPI2_GPU_MAX_DIRECT_DEVS - 1 )
		{
		  direct_devices[gaspi_devices] = device_id;
		  gaspi_devices++;
		}
	    }
	}
    }

  if( 0 == gaspi_devices )
    {
      gaspi_print_error("No GPU Direct RDMA capable devices on the correct NUMA-socket were found.");
      return GASPI_ERROR;
    }

  gpus = (gaspi_gpu_t*) malloc(sizeof(gaspi_gpu_t) * gaspi_devices);
  if( gpus == NULL )
    {
      gaspi_print_error("Failed to allocate memory.");
      return GASPI_ERR_MEMALLOC;
    }

  int i, j, k;
  for(k = 0 ; k < gaspi_devices; k++)
    {
      cuda_error_id = cudaSetDevice(direct_devices[k]);
      if( cuda_error_id != cudaSuccess )
	{
	  return GASPI_ERR_DEVICE;
	}

      for(i = 0; i < GASPI_MAX_QP; i++)
	{

	  for(j = 0; j < GASPI_CUDA_EVENTS; j++)
	    {
	      cuda_error_id = cudaEventCreateWithFlags(&gpus[k].events[i][j].event, cudaEventDisableTiming);
	      if( cuda_error_id != cudaSuccess )
		{
		  return GASPI_ERR_DEVICE;
		}
	    }

	  cuda_error_id = cudaStreamCreateWithFlags(&gpus[k].streams[i], cudaStreamNonBlocking);
	  if( cuda_error_id != cudaSuccess )
	    {
	      return GASPI_ERR_DEVICE;
	    }

	}

      gpus[k].device_id = direct_devices[k];
    }

  gctx->gpu_count = gaspi_devices;
  gctx->use_gpus = 1;

  return GASPI_SUCCESS;
}