int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale,
                           THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output)
{
    // Grab the input tensor
    float * data_flat = THCudaTensor_data(state, features);
    float * rois_flat = THCudaTensor_data(state, rois);
    float * output_flat = THCudaTensor_data(state, output);

    // Number of ROIs
    int num_rois = THCudaTensor_size(state, rois, 0);
    int size_rois = THCudaTensor_size(state, rois, 1);
    if (size_rois != 5)
    {
        return 0;
    }

    // data height
    int data_height = THCudaTensor_size(state, features, 2);
    // data width
    int data_width = THCudaTensor_size(state, features, 3);
    // Number of channels
    int num_channels = THCudaTensor_size(state, features, 1);

    cudaStream_t stream = THCState_getCurrentStream(state);

    ROIAlignForwardLaucher(
        data_flat, spatial_scale, num_rois, data_height,
        data_width, num_channels, aligned_height,
        aligned_width, rois_flat,
        output_flat, stream);

    return 1;
}
int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale,
                            THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad)
{
    // Grab the input tensor
    float * top_grad_flat = THCudaTensor_data(state, top_grad);
    float * rois_flat = THCudaTensor_data(state, rois);
    float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad);

    // Number of ROIs
    int num_rois = THCudaTensor_size(state, rois, 0);
    int size_rois = THCudaTensor_size(state, rois, 1);
    if (size_rois != 5)
    {
        return 0;
    }

    // batch size
    int batch_size = THCudaTensor_size(state, bottom_grad, 0);
    // data height
    int data_height = THCudaTensor_size(state, bottom_grad, 2);
    // data width
    int data_width = THCudaTensor_size(state, bottom_grad, 3);
    // Number of channels
    int num_channels = THCudaTensor_size(state, bottom_grad, 1);

    cudaStream_t stream = THCState_getCurrentStream(state);

    ROIAlignBackwardLaucher(
        top_grad_flat, spatial_scale, batch_size, num_rois, data_height,
        data_width, num_channels, aligned_height,
        aligned_width, rois_flat,
        bottom_grad_flat, stream);

    return 1;
}
// Make the current THC stream wait on a previously recorded cutorch event.
static int cutorch_Event_waitOn(lua_State *L)
{
  cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
  THCState *state = cutorch_getstate(L);
  THCudaCheck(cudaStreamWaitEvent(THCState_getCurrentStream(state), *event, 0));
  return 0;
}
// Write a single scalar into device storage at `index`: queue an async
// host-to-device copy on the current stream, then synchronize so the
// host-side temporary can be reused immediately.
void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar_t value)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
                              cudaMemcpyHostToDevice,
                              stream));
  THCudaCheck(cudaStreamSynchronize(stream));
}
// Read a single scalar from device storage at `index`: the synchronize
// guarantees `value` has arrived before it is returned.
scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  scalar_t value;
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t),
                              cudaMemcpyDeviceToHost,
                              stream));
  THCudaCheck(cudaStreamSynchronize(stream));
  return value;
}
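/*
 * Standalone sketch (not part of THC): the element-access pattern used by
 * THCStorage_(set)/THCStorage_(get) above, expressed with the plain CUDA runtime
 * API: an async copy queued on a stream, followed by a stream synchronize so the
 * host value is valid. The names demo_set_element/demo_get_element are
 * illustrative assumptions, not THC functions.
 */
#include <cuda_runtime.h>
#include <stddef.h>

static void demo_set_element(float *d_data, ptrdiff_t index, float value, cudaStream_t stream)
{
    // Queue the host->device copy on the given stream, then wait for it to finish.
    cudaMemcpyAsync(d_data + index, &value, sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);
}

static float demo_get_element(const float *d_data, ptrdiff_t index, cudaStream_t stream)
{
    float value;
    // Queue the device->host copy, then synchronize before the stack variable is read.
    cudaMemcpyAsync(&value, d_data + index, sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return value;
}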
// Create a cutorch event and record it on the current THC stream.
static int cutorch_Event_new(lua_State *L)
{
  cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t));
  THCudaCheck(cudaEventCreate(event));

  THCState *state = cutorch_getstate(L);
  THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state)));

  luaT_pushudata(L, event, "cutorch.Event");
  return 1;
}
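/*
 * Standalone sketch (not part of cutorch): the event-based ordering that
 * cutorch_Event_new / cutorch_Event_waitOn implement above. An event recorded on a
 * producer stream makes a consumer stream wait without blocking the host. The
 * function and parameter names here are illustrative assumptions.
 */
#include <cuda_runtime.h>

static void demo_cross_stream_ordering(cudaStream_t producer, cudaStream_t consumer)
{
    cudaEvent_t event;
    cudaEventCreate(&event);

    // Mark the point in the producer stream that the consumer must not overtake.
    cudaEventRecord(event, producer);

    // Work queued on the consumer stream after this call will not start until
    // the producer stream has reached the recorded event.
    cudaStreamWaitEvent(consumer, event, 0);

    cudaEventDestroy(event);
}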
cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size)
{
  THCudaCheck(cudaGetLastError());
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
  cudaError_t err = allocator->malloc(allocator->state, ptr, size, stream);
  // If the allocation failed, run the registered GC hook to release cached
  // blocks, then retry once.
  if (state->cutorchGCFunction != NULL && err != cudaSuccess) {
    cudaGetLastError(); // reset OOM error
    (state->cutorchGCFunction)(state->cutorchGCData);
    err = allocator->malloc(allocator->state, ptr, size, stream);
  }
  return err;
}
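/*
 * Standalone sketch (not part of THC): the retry-on-OOM shape of THCudaMalloc above,
 * using the plain CUDA runtime allocator instead of THC's device allocator. The
 * callback type gc_fn and the function name are illustrative assumptions.
 */
#include <cuda_runtime.h>
#include <stddef.h>

typedef void (*gc_fn)(void *userdata);

static cudaError_t demo_malloc_with_gc(void **ptr, size_t size, gc_fn gc, void *userdata)
{
    cudaError_t err = cudaMalloc(ptr, size);
    if (err != cudaSuccess && gc != NULL) {
        cudaGetLastError();           // clear the sticky out-of-memory error
        gc(userdata);                 // let the caller release cached allocations
        err = cudaMalloc(ptr, size);  // retry once after the collection pass
    }
    return err;
}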
void cudnn_affine_grid_generator_backward(
    THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
    THVoidTensor* grad_theta, THVoidTensor* grad_grid,
    int N, int C, int H, int W)
{
  // Run cuDNN on the same stream THC is currently using.
  CHECK(cudnnSetStream(handle, THCState_getCurrentStream(state)));
  assertSameGPU(dataType, grad_theta, grad_grid);
  checkIOSize(grad_theta, grad_grid, N, H, W);
  SpatialTransformerDescriptor desc;
  setSamplerDescriptor(desc, dataType, N, C, H, W);
  CHECK(cudnnSpatialTfGridGeneratorBackward(handle, desc.desc,
                                            tensorPointer(dataType, grad_grid),
                                            tensorPointer(dataType, grad_theta)));
}
// Copy a CPU tensor into a CUDA tensor (host -> device), blocking until the copy completes.
void THCTensor_(copyCPU)(THCState *state, THCTensor *self, struct THTensor *src)
{
  THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match");

  {
    THCTensor *selfc = THCTensor_(newContiguous)(state, self);
    src = THTensor_(newContiguous)(src);

    cudaStream_t stream = THCState_getCurrentStream(state);
    THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, selfc),
                                THTensor_(data)(src),
                                THTensor_(nElement)(src) * sizeof(real),
                                cudaMemcpyHostToDevice,
                                stream));
    THCudaCheck(cudaStreamSynchronize(stream));

    THTensor_(free)(src);
    THCTensor_(freeCopyTo)(state, selfc, self);
  }
}
// Wrap THC's current stream in an OpenCV cuda::Stream so cv::cuda work is queued on it.
cuda::Stream & prepareStream(cutorchInfo info)
{
    cuda::setDevice(info.deviceID - 1);
    fakeStream.impl_ = cv::makePtr<FakeStreamImpl>(THCState_getCurrentStream(info.state));
    return *reinterpret_cast<cuda::Stream *>(&fakeStream);
}
void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size)
{
  THArgCheck(size >= 0, 2, "invalid size");
  THAssert(self->allocator != NULL);
  int device;
  THCudaCheck(cudaGetDevice(&device));

  if(!(self->flag & TH_STORAGE_RESIZABLE))
    THError("Trying to resize storage that is not resizable");

  size_t elementSize = at::elementSize(self->scalar_type);

  if (self->allocator->realloc) {
    void * data_ptr = self->data_ptr;
    cudaError_t err = (*self->allocator->realloc)(
      self->allocatorContext,
      (void**)&(data_ptr),
      self->size * elementSize,
      size * elementSize,
      THCState_getCurrentStreamOnDevice(state, device));
    if (err != cudaSuccess) {
      THCudaCheck(err);
    }
    self->size = size;
    self->device = device;
    return;
  }

  if(size == 0)
  {
    if(self->flag & TH_STORAGE_FREEMEM) {
      THCudaCheck(
        (*self->allocator->free)(self->allocatorContext, self->data_ptr));
    }
    self->data_ptr = NULL;
    self->size = 0;
    self->device = device;
  }
  else
  {
    void *data = NULL;
    cudaError_t err =
      (*self->allocator->malloc)(self->allocatorContext,
                                 (void**)&(data),
                                 size * elementSize,
                                 THCState_getCurrentStreamOnDevice(state, device));
    THCudaCheck(err);

    if (self->data_ptr) {
      // Enable p2p access when the memcpy is across devices
      THCState_getPeerToPeerAccess(state, device, self->device);

      THCudaCheck(cudaMemcpyAsync(data,
                                  self->data_ptr,
                                  THMin(self->size, size) * elementSize,
                                  cudaMemcpyDeviceToDevice,
                                  THCState_getCurrentStream(state)));
      if(self->flag & TH_STORAGE_FREEMEM) {
        THCudaCheck(
          (*self->allocator->free)(self->allocatorContext, self->data_ptr));
      }
    }

    self->data_ptr = data;
    self->size = size;
    self->device = device;
  }
}
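/*
 * Standalone sketch (not part of THC): the allocate-copy-free path that
 * THCStorage_resize above falls back to when the allocator has no realloc hook.
 * A new buffer is allocated, the overlapping prefix is copied device-to-device on
 * the stream, and the old buffer is released. The function name is an
 * illustrative assumption; unlike THC, this version synchronizes explicitly
 * before freeing because it uses plain cudaMalloc/cudaFree rather than a
 * stream-aware caching allocator.
 */
#include <cuda_runtime.h>
#include <stddef.h>

static cudaError_t demo_resize_device_buffer(void **data_ptr, size_t old_bytes,
                                             size_t new_bytes, cudaStream_t stream)
{
    void *new_data = NULL;
    cudaError_t err = cudaMalloc(&new_data, new_bytes);
    if (err != cudaSuccess) return err;

    if (*data_ptr != NULL) {
        // Preserve whatever fits in the smaller of the two buffers.
        size_t copy_bytes = old_bytes < new_bytes ? old_bytes : new_bytes;
        err = cudaMemcpyAsync(new_data, *data_ptr, copy_bytes,
                              cudaMemcpyDeviceToDevice, stream);
        if (err != cudaSuccess) { cudaFree(new_data); return err; }
        cudaStreamSynchronize(stream);
        cudaFree(*data_ptr);
    }

    *data_ptr = new_data;
    return cudaSuccess;
}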