Ejemplo n.º 1
0
// Called when this solver's gradients are ready. Adds the gradients reported
// by each child into diff_, then either forwards diff_ to the parent or, when
// there is no parent, rescales diff_ by 1 / Caffe::solver_count().
// Elapsed times measured with *timer are appended to *timing.
// The whole body is compiled out when CPU_ONLY is defined.
void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  // Debug builds only: the current CUDA device must be this solver's device.
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // One queue entry is consumed per child; entries are processed in the
  // order the queue hands them out.
  for (int received = 0; received < children_.size(); ++received) {
    timer->Start();
    P2PSync<Dtype>* sender = queue_.pop();
    Dtype* child_grads = sender->parent_grads_;
    Dtype* accum = diff_;

#ifdef DEBUG
    // The popped entry has to be one of our registered children.
    bool is_child = false;
    for (int k = 0; k < children_.size(); ++k) {
      is_child = is_child || (sender == children_[k]);
    }
    CHECK(is_child);
    // Both the child's buffer and our accumulator must be on this device.
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, child_grads));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, accum));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, child_grads, accum, accum);
    *timing << " add_grad: " << timer->MilliSeconds();
  }

  if (!parent_) {
    // Root solver: loss functions divide gradients by the batch size, so to
    // compensate for the split batch, divide by the number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
    return;
  }

  // Non-root solver: send the accumulated gradients to the parent.
  timer->Start();
  Dtype* local_grads = diff_;
  Dtype* parent_buffer = parent_grads_;

#ifdef DEBUG
  // Source must be on this device, destination on the parent's device.
  cudaPointerAttributes attributes;
  CUDA_CHECK(cudaPointerGetAttributes(&attributes, local_grads));
  CHECK(attributes.device == device);
  CUDA_CHECK(cudaPointerGetAttributes(&attributes, parent_buffer));
  CHECK(attributes.device == parent_->solver_->param().device_id());
#endif

  CUDA_CHECK(cudaMemcpyAsync(parent_buffer, local_grads,
                             size_ * sizeof(Dtype),
                             cudaMemcpyDeviceToDevice, cudaStreamDefault));
  // The copy is synchronized before the parent is notified through its queue.
  CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
  parent_->queue_.push(this);
  *timing << " send_grad: " << timer->MilliSeconds();
#endif
}
Ejemplo n.º 2
0
/**
 * Check whether @a addr refers to CUDA device memory.
 *
 * The driver API (cuPointerGetAttribute) is queried first; if that query
 * fails, the runtime API (cudaPointerGetAttributes) is used as a fallback.
 *
 * @param md      Memory domain handle (not used by this function).
 * @param addr    Address to classify; NULL is never owned.
 * @param length  Length of the region (not used by this function).
 *
 * @return 1 if the address is reported as device memory, 0 otherwise
 *         (including when both queries fail).
 */
static int uct_is_gdr_copy_mem_type_owned(uct_md_h md, void *addr, size_t length)
{
    int memory_type;
    struct cudaPointerAttributes attributes;
    cudaError_t cuda_err;
    CUresult cu_err;

    if (addr == NULL) {
        return 0;
    }

    cu_err = cuPointerGetAttribute(&memory_type,
                                   CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                   (CUdeviceptr)addr);
    if (cu_err == CUDA_SUCCESS) {
        return (memory_type == CU_MEMORYTYPE_DEVICE) ? 1 : 0;
    }

    /* Driver query failed: fall back to the runtime API. */
    cuda_err = cudaPointerGetAttributes(&attributes, addr);
    if (cuda_err != cudaSuccess) {
        /* A failed runtime call is recorded as the thread's last error.
         * This is only a probe, so reset that state; otherwise an unrelated
         * cudaGetLastError()/cudaPeekAtLastError() check made later would
         * report this stale failure. */
        (void)cudaGetLastError();
        return 0;
    }

    return (attributes.memoryType == cudaMemoryTypeDevice) ? 1 : 0;
}
/**
 * cudaPointerGetAttributes is expected to return cudaErrorInvalidValue for
 * every combination of a NULL / non-NULL attribute output and a NULL /
 * host-stack address as the pointer being queried.
 */
TEST(PointerGetAttributes, NonNull) {
    struct cudaPointerAttributes attr;
    /* Only the address of this local is passed; its value is never read. */
    void * ptr;

    EXPECT_EQ(cudaErrorInvalidValue, cudaPointerGetAttributes(NULL,  NULL));
    EXPECT_EQ(cudaErrorInvalidValue, cudaPointerGetAttributes(&attr, NULL));
    EXPECT_EQ(cudaErrorInvalidValue, cudaPointerGetAttributes(NULL,  &ptr));
    EXPECT_EQ(cudaErrorInvalidValue, cudaPointerGetAttributes(&attr, &ptr));
}
Ejemplo n.º 4
0
// Start-of-iteration hook. Waits for the parent's notification (when there is
// a parent), then copies data_ into every child's data_ buffer and notifies
// each child through its queue. Elapsed times measured with *timer are
// appended to *timing. The whole body is compiled out when CPU_ONLY is
// defined.
void P2PSync<Dtype>::on_start(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  // Debug builds only: the current CUDA device must be this solver's device.
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Wait for the update from the parent; the popped entry must be the parent.
  if (parent_) {
    timer->Start();
    P2PSync<Dtype>* notifier = queue_.pop();
    CHECK(notifier == parent_);
    *timing << " recv_param: " << timer->MilliSeconds();
  }

  // The send is only timed when there is at least one child.
  const bool has_children = children_.size() != 0;
  if (has_children) {
    timer->Start();
  }

  // Update children, walking children_ from the last entry to the first.
  for (int idx = children_.size(); idx-- > 0;) {
    Dtype* local_params = data_;
    Dtype* child_params = children_[idx]->data_;

#ifdef DEBUG
    // Source must be on this device, destination on the child's device.
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, local_params));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, child_params));
    CHECK(attributes.device == children_[idx]->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(child_params, local_params,
                               size_ * sizeof(Dtype),
                               cudaMemcpyDeviceToDevice, cudaStreamDefault));
    // The copy is synchronized before the child is notified.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    children_[idx]->queue_.push(this);
  }

  if (has_children) {
    *timing << " send_param: " << timer->MilliSeconds();
  }
#endif
}
Ejemplo n.º 5
0
/* Report whether ptr is accessible, as decided by mem_device_accessible().
 * The disabled #else branch instead treats any pointer for which
 * cudaPointerGetAttributes() succeeds as accessible.
 */
bool cuda_accessible(const void* ptr)
{
#if 1
	const bool accessible = mem_device_accessible(ptr);

	return accessible;
#else
	struct cudaPointerAttributes attr;

	return (cudaSuccess == cudaPointerGetAttributes(&attr, ptr));
#endif
}
Ejemplo n.º 6
0
// Start-of-iteration hook. Waits for the parent's notification (when there is
// a parent), then copies data_ into every child's data_ buffer and notifies
// each child through its queue. The body is only compiled when CPU_ONLY is
// not defined and USE_CUDA is defined.
void P2PSync<Dtype>::on_start() {
#ifndef CPU_ONLY
#ifdef USE_CUDA
#ifdef DEBUG
  // Debug builds only: the current CUDA device must be this solver's device.
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Wait for the update from the parent; the popped entry must be the parent.
  if (parent_) {
    P2PSync<Dtype>* notifier = queue_.pop();
    CHECK(notifier == parent_);
  }

  // Update children, walking children_ from the last entry to the first.
  for (int idx = children_.size(); idx-- > 0;) {
    Dtype* local_params = data_;
    Dtype* child_params = children_[idx]->data_;

#ifdef DEBUG
    // Source must be on this device, destination on the child's device.
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, local_params));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, child_params));
    CHECK(attributes.device == children_[idx]->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(child_params, local_params,
                               size_ * sizeof(Dtype),
                               cudaMemcpyDeviceToDevice, cudaStreamDefault));
    // The copy is synchronized before the child is notified.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    children_[idx]->queue_.push(this);
  }
#endif  // USE_CUDA
#endif  // !CPU_ONLY
}
Ejemplo n.º 7
0
/* Report whether ptr is accessible: true exactly when search(ptr, false)
 * returns a non-NULL entry. The disabled #else branch instead treats any
 * pointer for which cudaPointerGetAttributes() succeeds as accessible.
 */
bool cuda_accessible(const void* ptr)
{
#if 1
	return (NULL != search(ptr, false));
#else
	struct cudaPointerAttributes attr;

	return (cudaSuccess == cudaPointerGetAttributes(&attr, ptr));
#endif
}
Ejemplo n.º 8
0
/**
 * A handle returned by cudaMalloc3DArray is expected to be rejected by
 * cudaPointerGetAttributes with cudaErrorInvalidValue.
 */
TEST(Malloc3DArray, Attributes) {
    /* Signed channel format with 8 in each of x, y, z and w. */
    struct cudaChannelFormatDesc desc;
    desc.w = 8;
    desc.z = desc.w;
    desc.y = desc.z;
    desc.x = desc.y;
    desc.f = cudaChannelFormatKindSigned;

    struct cudaArray * array;
    const cudaError_t alloc_status =
        cudaMalloc3DArray(&array, &desc, make_cudaExtent(1, 1, 1), 0);
    ASSERT_EQ(cudaSuccess, alloc_status);

    struct cudaPointerAttributes attr;
    const cudaError_t query_status = cudaPointerGetAttributes(&attr, array);
    EXPECT_EQ(cudaErrorInvalidValue, query_status);

    /* Non-fatal checks above ensure the array is always released. */
    EXPECT_EQ(cudaSuccess, cudaFreeArray(array));
}
Ejemplo n.º 9
0
/* Return true only if ptr is non-NULL and cudaPointerGetAttributes()
 * reports its memory type as cudaMemoryTypeDevice.
 */
static bool cuda_cuda_ondevice(const void* ptr)
{
	struct cudaPointerAttributes attr;

	if (NULL == ptr)
		return false;

	if (cudaSuccess == cudaPointerGetAttributes(&attr, ptr))
		return (cudaMemoryTypeDevice == attr.memoryType);

	/* The query failed. To make this usable for arbitrary pointers,
	 * the error is cleared with cudaGetLastError. See the end of:
	 * http://www.alexstjohn.com/WP/2014/04/28/cuda-6-0-first-look/
	 */
	cudaGetLastError();

	return false;
}
Ejemplo n.º 10
0
void show_memoryType( void *ptr )
{
    cudaError_t cuda_status ;
    struct cudaPointerAttributes  attributes;
    
    cuda_status = cudaPointerGetAttributes( &attributes, ptr);
    assert( cudaSuccess == cuda_status ) ;
    
    if ( cudaMemoryTypeHost == attributes.memoryType ){
        printf("\tptr belongs to host memory, device = %d, hostPointer = %p\n", 
            attributes.device, attributes.hostPointer );
    }else if ( cudaMemoryTypeDevice == attributes.memoryType ){
        printf("\tptr belongs to device memory, device = %d, devicePointer = %p\n", 
            attributes.device, attributes.devicePointer );
    }else{
        printf("Error: unknown .memoryType %d\n", attributes.memoryType);	
        exit(1);
    }
    
}
/**
 * A handle returned by cudaMallocArray is expected to be rejected by
 * cudaPointerGetAttributes with cudaErrorInvalidValue.
 */
TEST(PointerGetAttributes, Array) {
    /* Signed channel format with 8 in each of x, y, z and w. */
    struct cudaChannelFormatDesc desc;
    desc.w = 8;
    desc.z = desc.w;
    desc.y = desc.z;
    desc.x = desc.y;
    desc.f = cudaChannelFormatKindSigned;

    /* The device ordinal is only queried; the test requires the query to
       succeed but does not use the value afterwards. */
    int device;
    ASSERT_EQ(cudaSuccess, cudaGetDevice(&device));

    struct cudaArray * array;
    ASSERT_EQ(cudaSuccess, cudaMallocArray(&array, &desc, 1, 1, 0));

    struct cudaPointerAttributes attr;
    EXPECT_EQ(cudaErrorInvalidValue, cudaPointerGetAttributes(&attr, array));

    ASSERT_EQ(cudaSuccess, cudaFreeArray(array));
}
/**
 * For a cudaMalloc allocation, cudaPointerGetAttributes is expected to
 * succeed and report: the current device, a device pointer equal to the
 * allocation, a NULL host pointer, and memory type cudaMemoryTypeDevice.
 */
TEST(PointerGetAttributes, Malloc) {
    int device;
    ASSERT_EQ(cudaSuccess, cudaGetDevice(&device));

    /* sizeof(1) is the size of an int literal, so a few bytes are allocated. */
    void * ptr;
    ASSERT_EQ(cudaSuccess, cudaMalloc(&ptr, sizeof(1)));

    struct cudaPointerAttributes attr;
    const cudaError_t query_status = cudaPointerGetAttributes(&attr, ptr);
    ASSERT_EQ(cudaSuccess, query_status);

    EXPECT_EQ(device, attr.device);
    EXPECT_EQ(ptr, attr.devicePointer);
    EXPECT_EQ(NULL, attr.hostPointer);
    EXPECT_EQ(cudaMemoryTypeDevice, attr.memoryType);

    ASSERT_EQ(cudaSuccess, cudaFree(ptr));
}