int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale,
                        THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output)
{
    // Grab the raw device pointers (`state` is the extension's global
    // THCState, typically declared `extern THCState *state;` at file scope)
    float * data_flat = THCudaTensor_data(state, features);
    float * rois_flat = THCudaTensor_data(state, rois);

    float * output_flat = THCudaTensor_data(state, output);

    // Number of ROIs
    int num_rois = THCudaTensor_size(state, rois, 0);
    int size_rois = THCudaTensor_size(state, rois, 1);
    // Each ROI row is expected to be [batch_idx, x1, y1, x2, y2]
    if (size_rois != 5)
    {
        return 0;
    }

    // data height
    int data_height = THCudaTensor_size(state, features, 2);
    // data width
    int data_width = THCudaTensor_size(state, features, 3);
    // Number of channels
    int num_channels = THCudaTensor_size(state, features, 1);

    cudaStream_t stream = THCState_getCurrentStream(state);

    ROIAlignForwardLaucher(
        data_flat, spatial_scale, num_rois, data_height,
        data_width, num_channels, aligned_height,
        aligned_width, rois_flat,
        output_flat, stream);

    return 1;
}
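
A minimal call sketch for the function above (the tensor handles, the 7x7 pooled size, and the 1/16 spatial scale are illustrative assumptions; in the original extension these arguments arrive from the Python FFI layer):

/* features: (batch, C, H, W) feature map on the GPU
 * rois:     (num_rois, 5) rows of [batch_idx, x1, y1, x2, y2]
 * output:   pre-allocated (num_rois, C, 7, 7) result tensor
 * All three are hypothetical THCudaTensor handles. */
int ok = roi_align_forward_cuda(7, 7, 1.0f / 16.0f, features, rois, output);
if (!ok) {
    /* rois did not have 5 columns */
}
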
int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale,
                        THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad)
{
    // Grab the raw device pointers
    float * top_grad_flat = THCudaTensor_data(state, top_grad);
    float * rois_flat = THCudaTensor_data(state, rois);

    float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad);

    // Number of ROIs
    int num_rois = THCudaTensor_size(state, rois, 0);
    int size_rois = THCudaTensor_size(state, rois, 1);
    // Each ROI row is expected to be [batch_idx, x1, y1, x2, y2]
    if (size_rois != 5)
    {
        return 0;
    }

    // batch size
    int batch_size = THCudaTensor_size(state, bottom_grad, 0);
    // data height
    int data_height = THCudaTensor_size(state, bottom_grad, 2);
    // data width
    int data_width = THCudaTensor_size(state, bottom_grad, 3);
    // Number of channels
    int num_channels = THCudaTensor_size(state, bottom_grad, 1);

    cudaStream_t stream = THCState_getCurrentStream(state);
    ROIAlignBackwardLaucher(
        top_grad_flat, spatial_scale, batch_size, num_rois, data_height,
        data_width, num_channels, aligned_height,
        aligned_width, rois_flat,
        bottom_grad_flat, stream);

    return 1;
}
Example #3
File: init.c Project: ASAPPinc/cutorch
static int cutorch_Event_waitOn(lua_State *L)
{
  cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
  THCState *state = cutorch_getstate(L);
  THCudaCheck(cudaStreamWaitEvent(THCState_getCurrentStream(state), *event, 0));

  return 0;
}
Example #4
void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar_t value)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
                              cudaMemcpyHostToDevice,
                              stream));
  THCudaCheck(cudaStreamSynchronize(stream));
}
Example #5
scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  scalar_t value;
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
                              cudaMemcpyDeviceToHost, stream));
  THCudaCheck(cudaStreamSynchronize(stream));
  return value;
}
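
Both accessors use the same two-step pattern: the one-element copy is enqueued with cudaMemcpyAsync on the current stream, so it stays ordered after any kernels already queued there, and the immediate cudaStreamSynchronize guarantees the transfer has finished while the stack-allocated `value` (the copy's source in `set`, its destination in `get`) is still alive.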
Example #6
File: init.c Project: ASAPPinc/cutorch
static int cutorch_Event_new(lua_State *L)
{
  cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t));
  THCudaCheck(cudaEventCreate(event));

  THCState *state = cutorch_getstate(L);
  THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state)));
  luaT_pushudata(L, event, "cutorch.Event");

  return 1;
}
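
Examples #3 and #6 are the two halves of a record-then-wait handshake between streams. A sketch of the same pattern in raw CUDA calls (streamA and streamB are hypothetical stream handles):

cudaEvent_t event;
THCudaCheck(cudaEventCreate(&event));
THCudaCheck(cudaEventRecord(event, streamA));        /* mark progress on streamA */
THCudaCheck(cudaStreamWaitEvent(streamB, event, 0)); /* streamB waits for the mark */
/* work enqueued on streamB after this point runs only once streamA
 * has passed the recorded event */
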
Example #7
cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size)
{
  THCudaCheck(cudaGetLastError());
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
  cudaError_t err = allocator->malloc(allocator->state, ptr, size, stream);
  if (state->cutorchGCFunction != NULL && err != cudaSuccess) {
    cudaGetLastError(); // reset OOM error
    (state->cutorchGCFunction)(state->cutorchGCData);
    err = allocator->malloc(allocator->state, ptr, size, stream);
  }
  return err;
}
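
The function above implements a retry-after-GC allocation: if the first attempt fails and a collection hook is registered, the sticky CUDA error is cleared, the hook releases cached device blocks, and the allocation is retried once. A stripped-down sketch of the same pattern (the Allocator type and all names below are placeholders, not the cutorch API):

#include <cuda_runtime.h>

typedef struct Allocator {
  void *state;
  cudaError_t (*malloc)(void *state, void **ptr, size_t size, cudaStream_t stream);
} Allocator;

static cudaError_t alloc_with_gc_retry(Allocator *a, void **ptr, size_t size,
                                       cudaStream_t stream,
                                       void (*gc)(void *), void *gc_data)
{
  cudaError_t err = a->malloc(a->state, ptr, size, stream);
  if (gc != NULL && err != cudaSuccess) {
    cudaGetLastError();                            /* clear the sticky OOM error */
    gc(gc_data);                                   /* free cached device blocks */
    err = a->malloc(a->state, ptr, size, stream);  /* single retry */
  }
  return err;
}
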
Example #8
void cudnn_affine_grid_generator_backward(
    THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
    THVoidTensor* grad_theta, THVoidTensor* grad_grid,
    int N, int C, int H, int W)
{
  CHECK(cudnnSetStream(handle, THCState_getCurrentStream(state)));
  assertSameGPU(dataType, grad_theta, grad_grid);
  checkIOSize(grad_theta, grad_grid, N, H, W);
  SpatialTransformerDescriptor desc;
  setSamplerDescriptor(desc, dataType, N, C, H, W);
  CHECK(cudnnSpatialTfGridGeneratorBackward(handle, desc.desc,
                                            tensorPointer(dataType, grad_grid),
                                            tensorPointer(dataType, grad_theta)));
}
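
The cudnnSetStream call on the first line is what ties cuDNN into Torch's stream model: it binds the handle to the current THC stream, so the grid-generator kernel is ordered with the rest of the work queued there instead of running on cuDNN's default stream.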
Example #9
void THCTensor_(copyCPU)(THCState *state, THCTensor *self, struct THTensor *src)
{
  THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match");

  {
    THCTensor *selfc = THCTensor_(newContiguous)(state, self);
    src = THTensor_(newContiguous)(src);

    cudaStream_t stream = THCState_getCurrentStream(state);
    THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state,selfc),
                                THTensor_(data)(src),
                                THTensor_(nElement)(src) * sizeof(real),
                                cudaMemcpyHostToDevice,
                                stream));
    THCudaCheck(cudaStreamSynchronize(stream));

    THTensor_(free)(src);
    THCTensor_(freeCopyTo)(state, selfc, self);
  }
}
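
The cudaStreamSynchronize here is load-bearing: `src` is freed immediately afterwards, so the host-to-device copy must have completed before the CPU buffer is released.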
Example #10
cuda::Stream & prepareStream(cutorchInfo info) {
    // cutorch device IDs are 1-based; CUDA/OpenCV device indices are 0-based
    cuda::setDevice(info.deviceID - 1);
    // Wrap the current THC stream in an OpenCV cuda::Stream (fakeStream and
    // FakeStreamImpl are presumably defined at file scope in the same source)
    // so OpenCV work is ordered with Torch work on the same stream
    fakeStream.impl_ = cv::makePtr<FakeStreamImpl>(THCState_getCurrentStream(info.state));
    return *reinterpret_cast<cuda::Stream *>(&fakeStream);
}
Example #11
void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size)
{
  THArgCheck(size >= 0, 2, "invalid size");
  THAssert(self->allocator != NULL);
  int device;
  THCudaCheck(cudaGetDevice(&device));

  if(!(self->flag & TH_STORAGE_RESIZABLE))
    THError("Trying to resize storage that is not resizable");

  size_t elementSize = at::elementSize(self->scalar_type);

  if (self->allocator->realloc) {
    void * data_ptr = self->data_ptr;
    cudaError_t err = (*self->allocator->realloc)(
      self->allocatorContext,
      (void**)&(data_ptr),
      self->size * elementSize,
      size * elementSize, THCState_getCurrentStreamOnDevice(state, device));
    if (err != cudaSuccess) {
      THCudaCheck(err);
    }
    self->size = size;
    self->device = device;
    return;
  }

  if(size == 0)
  {
    if(self->flag & TH_STORAGE_FREEMEM) {
      THCudaCheck(
        (*self->allocator->free)(self->allocatorContext, self->data_ptr));
    }
    self->data_ptr = NULL;
    self->size = 0;
    self->device = device;
  }
  else
  {
    void *data = NULL;
    cudaError_t err =
      (*self->allocator->malloc)(self->allocatorContext,
                                 (void**)&(data),
                                 size * elementSize,
                                 THCState_getCurrentStreamOnDevice(state, device));
    THCudaCheck(err);

    if (self->data_ptr) {
      // Enable p2p access when the memcpy is across devices
      THCState_getPeerToPeerAccess(state, device, self->device);

      THCudaCheck(cudaMemcpyAsync(data,
                                  self->data_ptr,
                                  THMin(self->size, size) * elementSize,
                                  cudaMemcpyDeviceToDevice,
                                  THCState_getCurrentStream(state)));
      if(self->flag & TH_STORAGE_FREEMEM) {
        THCudaCheck(
          (*self->allocator->free)(self->allocatorContext, self->data_ptr));
      }
    }

    self->data_ptr = data;
    self->size = size;
    self->device = device;
  }
}
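
In summary, the resize takes one of three paths: a realloc fast path when the allocator provides one, a plain free when the new size is zero, and otherwise malloc-copy-free, where the device-to-device cudaMemcpyAsync is issued on the current stream (peer-to-peer access is enabled first in case the old buffer lives on a different device).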