void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor *src) { THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); THArgCheck(THTensor_(isContiguous)(self), 2, "Target tensor must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, src), 3, "Source tensor must be contiguous"); if (THTensor_(nElement)(self) == 0) return; // Perform the copy wrt the current stream on the CudaTensor's device. int tensorDevice = THCTensor_(getDevice)(state, src); int currentDevice; THCudaCheck(cudaGetDevice(¤tDevice)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(tensorDevice)); } THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self), THCTensor_(data)(state, src), THCTensor_(nElement)(state, src) * sizeof(real), cudaMemcpyDeviceToHost, THCState_getDeviceStream(state, tensorDevice, THCState_getCurrentStreamIndex(state)))); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(currentDevice)); } }
/* Usage: n = cutorch.getStream() Returns the current user stream for all devices in use (as previously set via cutorch.setStream(n). 0 is the default stream on the device and is its initial value. */ static int cutorch_getStream(lua_State *L) { THCState *state = cutorch_getstate(L); lua_pushnumber(L, THCState_getCurrentStreamIndex(state)); return 1; }
void* THCState_getCurrentDeviceScratchSpace(THCState* state) { int device = -1; THCudaCheck(cudaGetDevice(&device)); int stream = THCState_getCurrentStreamIndex(state); return THCState_getDeviceScratchSpace(state, device, stream); }
static int cutorch_setDevice(lua_State *L) { THCState *state = cutorch_getstate(L); int device = (int)luaL_checknumber(L, 1)-1; THCudaCheck(cudaSetDevice(device)); THCRandom_setGenerator(state, device); /* The stream is per device, so update the stream as well */ THCState_setStream(state, device, THCState_getCurrentStreamIndex(state)); THCState_setBlasHandle(state, device, THCState_getCurrentBlasHandleIndex(state)); return 0; }
void THCState_setDevice(THCState *state, int device) { int curDev; THCudaCheck(cudaGetDevice(&curDev)); if (device != curDev) { THCudaCheck(cudaSetDevice(device)); THCRandom_setGenerator(state, device); THCudaBlas_setHandle(state, device); /* The stream is per device, so update the stream as well */ THCState_setStream(state, device, THCState_getCurrentStreamIndex(state)); } }
cudaStream_t THCState_getCurrentStream(THCState *state) { /* This is called at the point of kernel execution. For some debugging code or improperly instrumented kernels, `state` is null */ if (state) { int device; THCudaCheck(cudaGetDevice(&device)); int streamIndex = THCState_getCurrentStreamIndex(state); if (streamIndex == 0) { return NULL; } return THCState_getDeviceResourcePtr(state, device)->streams[streamIndex]; } else { /* assume default stream */ return NULL; } }