void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor *src) { THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match"); THArgCheck(THTensor_(isContiguous)(self), 2, "Target tensor must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, src), 3, "Source tensor must be contiguous"); if (THTensor_(nElement)(self) == 0) return; /* Perform the copy wrt the current stream on the CudaTensor's device. */ int tensorDevice = THCTensor_(getDevice)(state, src); int currentDevice; THCudaCheck(cudaGetDevice(&currentDevice)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(tensorDevice)); } THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self), THCTensor_(data)(state, src), THCTensor_(nElement)(state, src) * sizeof(real), cudaMemcpyDeviceToHost, THCState_getDeviceStream(state, tensorDevice, THCState_getCurrentStreamIndex(state)))); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(currentDevice)); } }
/* Usage: cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN}) for streams on the current device. Creates a one-way barrier where waiterStream waits for waitForStream1-N to reach the current point. */ static int cutorch_streamWaitFor(lua_State *L) { THCState *state = cutorch_getstate(L); int curDev = -1; THCudaCheck(cudaGetDevice(&curDev)); /* Check that the waiting stream is in bounds; this will error out if not */ int waitingId = (int) luaL_checknumber(L, 1); cudaStream_t streamWaiting = THCState_getDeviceStream(state, curDev, waitingId); /* Validate the streams that we are waiting on */ int streams = checkAndCountListOfStreams(L, state, 2, curDev); if (streams < 1) { /* nothing to synchronize */ return 0; } /* One-way dependency; streamWaiting will wait for the list of streams to wait on to complete execution of pending scheduled kernels/events */ cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams); createSingleDeviceEvents(L, state, 2, curDev, events); /* Then, wait on them */ for (int i = 0; i < streams; i++) { THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0)); THCudaCheck(cudaEventDestroy(events[i])); } free(events); return 0; }
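/* The function above implements the one-way wait with the standard CUDA event pattern:
   record an event on every stream to be waited on, then issue cudaStreamWaitEvent on the
   waiter. A minimal standalone sketch of that pattern with the plain CUDA runtime API,
   independent of THCState; the helper name and stream names are illustrative only, and
   error checking is omitted for brevity. */
#include <cuda_runtime.h>

static void stream_wait_for(cudaStream_t waiter, const cudaStream_t *waitFor, int n) {
  for (int i = 0; i < n; ++i) {
    cudaEvent_t ev;
    /* Timing is not needed, so disable it for a cheaper event. */
    cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
    /* Mark the current point in the i-th stream... */
    cudaEventRecord(ev, waitFor[i]);
    /* ...and make the waiter stream (not the host) block on it. */
    cudaStreamWaitEvent(waiter, ev, 0);
    /* The event can be destroyed immediately; the queued dependency survives it. */
    cudaEventDestroy(ev);
  }
}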
void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor *src) { THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match"); THArgCheck(THCTensor_(isContiguous)(state, self), 2, "Target tensor must be contiguous"); THArgCheck(THTensor_(isContiguous)(src), 3, "Source tensor must be contiguous"); if (THCTensor_(nElement)(state, self) == 0) return; /* Perform the copy wrt the current stream on the CudaTensor's device. */ int tensorDevice = THCTensor_(getDevice)(state, self); int currentDevice; THCudaCheck(cudaGetDevice(&currentDevice)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(tensorDevice)); } THCStream *stream = THCState_getStream(state); THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self), THTensor_(data)(src), THTensor_(nElement)(src) * sizeof(real), cudaMemcpyHostToDevice, stream->stream)); THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(src->storage), stream)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(currentDevice)); } }
void THCudaShutdown(THCState* state) { THCRandom_shutdown(state); THCudaBlas_shutdown(state); free(state->blasState); free(state->rngState); free(state->deviceProperties); int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); for (int dev = 0; dev < state->numDevices; ++dev) { THCudaCheck(cudaSetDevice(dev)); /* Free Torch-defined streams (0 is the default stream) */ for (int stream = 1; stream <= state->numUserStreams; ++stream) { THCudaCheck(cudaStreamDestroy(state->streamsPerDevice[dev][stream])); } free(state->streamsPerDevice[dev]); } free(state->streamsPerDevice); THCudaCheck(cudaSetDevice(prevDev)); }
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess) { if (dev < 0 || dev >= state->numDevices) { THError("%d is not a device", dev); } if (devToAccess < 0 || devToAccess >= state->numDevices) { THError("%d is not a device", devToAccess); } if (state->p2pAccessEnabled[dev][devToAccess] == -1) { int prevDev = 0; THCudaCheck(cudaGetDevice(&prevDev)); THCudaCheck(cudaSetDevice(dev)); int access = 0; THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess)); if (access) { cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { /* ignore and clear the error if access was already enabled */ cudaGetLastError(); } else { THCudaCheck(err); } state->p2pAccessEnabled[dev][devToAccess] = 1; } else { state->p2pAccessEnabled[dev][devToAccess] = 0; } THCudaCheck(cudaSetDevice(prevDev)); } return state->p2pAccessEnabled[dev][devToAccess]; }
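/* For reference, the same probe-and-enable dance with the raw runtime API:
   cudaDeviceEnablePeerAccess() must be issued while the accessing device is current,
   and cudaErrorPeerAccessAlreadyEnabled is benign and only needs to be cleared.
   This is a sketch only; the helper name is invented and error handling is reduced
   to a return code. */
#include <cuda_runtime.h>

static int enable_peer_access(int dev, int devToAccess) {
  int prevDev = 0, access = 0;
  cudaGetDevice(&prevDev);
  cudaSetDevice(dev);                      /* switch to the accessing device */
  cudaDeviceCanAccessPeer(&access, dev, devToAccess);
  if (access) {
    cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
    if (err == cudaErrorPeerAccessAlreadyEnabled)
      cudaGetLastError();                  /* clear the sticky error state */
  }
  cudaSetDevice(prevDev);                  /* restore the previous device */
  return access;                           /* 1 if peer access is possible */
}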
/* Usage: cutorch.streamBarrier({stream1, stream2, ..., streamN}) applies to streams for the current device. Creates a N-way barrier to synchronize all of the streams given */ static int cutorch_streamBarrier(lua_State *L) { THCState *state = cutorch_getstate(L); int curDev = -1; THCudaCheck(cudaGetDevice(&curDev)); int streams = checkAndCountListOfStreams(L, state, 1, curDev); if (streams < 2) { /* nothing to synchronize together */ return 0; } /* Multi-way dependency (barrier); all streams must complete execution of pending scheduled kernels/events */ cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams); /* First, create an event and record them for all streams */ int eventsCreated = createSingleDeviceEvents(L, state, 1, curDev, events); /* Then, wait on the event. Each stream is actually waiting on itself here too, but that's harmless and isn't worth weeding out. */ waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated); for (int i = 0; i < eventsCreated; i++) THCudaCheck(cudaEventDestroy(events[i])); free(events); return 0; }
THCStream* THCStream_newWithPriority(int flags, int priority) { THCStream* self = (THCStream*) malloc(sizeof(THCStream)); self->refcount = 1; THCudaCheck(cudaGetDevice(&self->device)); THCudaCheck(cudaStreamCreateWithPriority(&self->stream, flags, priority)); return self; }
THCStream* THCStream_new(int flags) { THCStream* self = (THCStream*) malloc(sizeof(THCStream)); self->refcount = 1; THCudaCheck(cudaGetDevice(&self->device)); THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags)); return self; }
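/* THCStream_newWithPriority() forwards the priority straight to CUDA; valid values are
   bounded by the device's supported range. A small sketch (the helper name is
   illustrative) of querying that range and creating a high-priority, non-blocking
   stream with the plain runtime API: */
#include <cuda_runtime.h>

static cudaStream_t make_high_priority_stream(void) {
  int leastPriority = 0, greatestPriority = 0;
  /* Numerically lower values mean higher priority; the range is per device. */
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  cudaStream_t stream;
  cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, greatestPriority);
  return stream;
}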
void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar_t value) { THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t), cudaMemcpyHostToDevice, stream)); THCudaCheck(cudaStreamSynchronize(stream)); }
scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) { THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); scalar_t value; cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(scalar_t), cudaMemcpyDeviceToHost, stream)); THCudaCheck(cudaStreamSynchronize(stream)); return value; }
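/* Both accessors above pair cudaMemcpyAsync on the current stream with an immediate
   cudaStreamSynchronize, which keeps single-element reads and writes ordered after any
   kernels already queued on that stream. A standalone sketch of the read side in plain
   CUDA; the helper name is hypothetical. */
#include <cuda_runtime.h>
#include <stddef.h>

static float read_device_scalar(const float *d_data, ptrdiff_t index, cudaStream_t stream) {
  float value;
  /* Enqueue the copy behind whatever is already on the stream... */
  cudaMemcpyAsync(&value, d_data + index, sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  /* ...then block the host until the copy has actually landed. */
  cudaStreamSynchronize(stream);
  return value;
}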
static int cutorch_Event_new(lua_State *L) { cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t)); THCudaCheck(cudaEventCreate(event)); THCState *state = cutorch_getstate(L); THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state))); luaT_pushudata(L, event, "cutorch.Event"); return 1; }
void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking) { if (numStreams <= state->numUserStreams) { return; } int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); /* Otherwise, we have to allocate a new set of streams and stream data */ for (int dev = 0; dev < state->numDevices; ++dev) { THCudaCheck(cudaSetDevice(dev)); /* +1 for the default stream as well */ cudaStream_t* newStreams = (cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t)); void** newScratchSpace = (void**) malloc((numStreams + 1) * sizeof(void*)); /* Copy over old stream data (0 is default stream, 1 ... numUserStreams are rest) */ for (int stream = 0; stream <= state->numUserStreams; ++stream) { newStreams[stream] = THCState_getDeviceStream(state, dev, stream); newScratchSpace[stream] = THCState_getDeviceScratchSpace(state, dev, stream); } /* Allocate new stream resources */ size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev); unsigned int flags = nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault; for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) { newStreams[stream] = NULL; THCudaCheck(cudaStreamCreateWithFlags(newStreams + stream, flags)); newScratchSpace[stream] = NULL; THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize)); } THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev); free(res->streams); res->streams = newStreams; free(res->devScratchSpacePerStream); res->devScratchSpacePerStream = newScratchSpace; } state->numUserStreams = numStreams; THCudaCheck(cudaSetDevice(prevDev)); }
void THCState_setDevice(THCState *state, int device) { int curDev; THCudaCheck(cudaGetDevice(&curDev)); if (device != curDev) { THCudaCheck(cudaSetDevice(device)); THCRandom_setGenerator(state, device); THCudaBlas_setHandle(state, device); /* The stream is per device, so update the stream as well */ THCState_setStream(state, device, THCState_getCurrentStreamIndex(state)); } }
/* Usage: cutorch.streamWaitForMultiDevice(gpuWaiter, streamWaiter, {[gpu1]={stream1_1, ..., stream1_N}, [gpuK]={streamK_1, ..., streamK_M}}) with a specified GPU for each list of streams. Stream (gpuWaiter, streamWaiter) will wait on all of the other streams (gpu1, stream1_1), ..., (gpu1, stream1_N), ..., (gpuK, streamK_1), ..., (gpuK, streamK_M) to complete fully, as a one-way barrier only (only streamWaiter is blocked). The streams to wait on are bucketed per device. Equivalent to streamWaitFor() if only one GPU's streams are listed. */ static int cutorch_streamWaitForMultiDevice(lua_State *L) { THCState *state = cutorch_getstate(L); int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); /* Validate the waiting (gpu, stream) pair; this will error out if invalid */ int gpuWaiter = (int) luaL_checknumber(L, 1) - 1; int streamWaiter = (int) luaL_checknumber(L, 2); cudaStream_t streamWaiting = THCState_getDeviceStream(state, gpuWaiter, streamWaiter); /* Validate and count the set of {gpu={streams...}} we are waiting on */ int gpus = 0; int streams = 0; checkAndCountListOfGPUStreamPairs(L, state, 3, &gpus, &streams); if (streams < 1) { /* nothing to synchronize */ return 0; } /* Events can only be recorded on the same device on which they are created: for each GPU, create and record an event for each of that GPU's listed streams, then have (gpuWaiter, streamWaiter) wait on all of those events. */ cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams); /* First, create and record an event for each listed (gpu, stream) pair */ createMultiDeviceEvents(L, state, 3, events); /* Then, wait on the events */ THCudaCheck(cudaSetDevice(gpuWaiter)); for (int i = 0; i < streams; ++i) { THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0)); } /* Clean up events */ for (int i = 0; i < streams; ++i) { THCudaCheck(cudaEventDestroy(events[i])); } free(events); THCudaCheck(cudaSetDevice(prevDev)); return 0; }
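/* The multi-device variant relies on two CUDA guarantees: an event must be recorded with
   its owning device current, but cudaStreamWaitEvent() may then be issued against a stream
   on any device. A reduced sketch for a single remote stream (the function name is
   illustrative, error checking omitted): */
#include <cuda_runtime.h>

static void wait_on_remote_stream(int waiterDev, cudaStream_t waiterStream,
                                  int srcDev, cudaStream_t srcStream) {
  int prevDev = -1;
  cudaGetDevice(&prevDev);

  cudaEvent_t ev;
  cudaSetDevice(srcDev);                       /* record on the source device */
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, srcStream);

  cudaSetDevice(waiterDev);                    /* wait from the other device's stream */
  cudaStreamWaitEvent(waiterStream, ev, 0);
  cudaEventDestroy(ev);

  cudaSetDevice(prevDev);
}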
/* Usage: cutorch.streamSynchronize(n) For the current device, synchronizes with the given stream only (cudaStreamSynchronize). 0 is the default stream on the device. */ static int cutorch_streamSynchronize(lua_State *L) { THCState *state = cutorch_getstate(L); int streamId = (int) luaL_checknumber(L, 1); int curDev = -1; THCudaCheck(cudaGetDevice(&curDev)); /* This also validates the stream */ cudaStream_t stream = THCState_getDeviceStream(state, curDev, streamId); THCudaCheck(cudaStreamSynchronize(stream)); return 0; }
static int cutorch_getDeviceProperties(lua_State *L) { int device = (int)luaL_checknumber(L, 1)-1; /* switch context to given device so the call to cudaMemGetInfo is for the correct device */ int oldDevice; THCudaCheck(cudaGetDevice(&oldDevice)); THCudaCheck(cudaSetDevice(device)); struct cudaDeviceProp prop; THCudaCheck(cudaGetDeviceProperties(&prop, device)); lua_newtable(L); SET_DEVN_PROP(canMapHostMemory); SET_DEVN_PROP(clockRate); SET_DEVN_PROP(computeMode); SET_DEVN_PROP(deviceOverlap); SET_DEVN_PROP(integrated); SET_DEVN_PROP(kernelExecTimeoutEnabled); SET_DEVN_PROP(major); SET_DEVN_PROP(maxThreadsPerBlock); SET_DEVN_PROP(memPitch); SET_DEVN_PROP(minor); SET_DEVN_PROP(multiProcessorCount); SET_DEVN_PROP(regsPerBlock); SET_DEVN_PROP(sharedMemPerBlock); SET_DEVN_PROP(textureAlignment); SET_DEVN_PROP(totalConstMem); SET_DEVN_PROP(totalGlobalMem); SET_DEVN_PROP(warpSize); SET_DEVN_PROP(pciBusID); SET_DEVN_PROP(pciDeviceID); SET_DEVN_PROP(pciDomainID); SET_DEVN_PROP(maxTexture1D); SET_DEVN_PROP(maxTexture1DLinear); size_t freeMem; THCudaCheck(cudaMemGetInfo(&freeMem, NULL)); lua_pushnumber(L, freeMem); lua_setfield(L, -2, "freeGlobalMem"); lua_pushstring(L, prop.name); lua_setfield(L, -2, "name"); /* restore context */ THCudaCheck(cudaSetDevice(oldDevice)); return 1; }
void THCudaShutdown(THCState* state) { THCRandom_shutdown(state); free(state->rngState); free(state->deviceProperties); int deviceCount = 0; int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); THCudaCheck(cudaGetDeviceCount(&deviceCount)); /* cleanup p2p access state */ for (int dev = 0; dev < deviceCount; ++dev) { free(state->p2pAccessEnabled[dev]); } free(state->p2pAccessEnabled); /* cleanup per-device state */ for (int dev = 0; dev < deviceCount; ++dev) { THCudaCheck(cudaSetDevice(dev)); THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]); /* Free user defined BLAS handles */ for (int i = 0; i < res->numBlasHandles; ++i) { THCublasCheck(cublasDestroy(res->blasHandles[i])); } /* Free user defined sparse handles */ for (int i = 0; i < res->numSparseHandles; ++i) { THCusparseCheck(cusparseDestroy(res->sparseHandles[i])); } free(res->blasHandles); free(res->sparseHandles); THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev])); THCThreadLocal_free(state->currentStreams[dev]); } free(state->resourcesPerDevice); if (state->cudaDeviceAllocator->emptyCache) { state->cudaDeviceAllocator->emptyCache(state->cudaDeviceAllocator->state); } if (state->cudaHostAllocator == &THCCachingHostAllocator) { THCCachingHostAllocator_emptyCache(); } free(state->currentStreams); THCThreadLocal_free(state->currentPerDeviceBlasHandle); THCudaCheck(cudaSetDevice(prevDev)); }
struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state) { int curDev = -1; THCudaCheck(cudaGetDevice(&curDev)); return &(state->deviceProperties[curDev]); }
static PyObject * THPStorage_(shareCuda)(THPStorage *self) { HANDLE_TH_ERRORS THStorage *storage = self->cdata; AutoGPU gpu_guard(storage->device); THPObjectPtr tuple(PyTuple_New(5)); THPObjectPtr device(PyLong_FromLong(storage->device)); THPObjectPtr _handle(Py_None); Py_INCREF(Py_None); THPObjectPtr size(PyLong_FromLong(storage->size)); THPObjectPtr _offset(PyLong_FromLong(0)); THPObjectPtr view_size(PyLong_FromLong(storage->size)); if (storage->data) { size_t base_size; void *base_ptr = THCCachingAllocator_getBaseAllocation(storage->data, &base_size); ptrdiff_t offset = (char*)storage->data - (char*)base_ptr; cudaIpcMemHandle_t handle; THCudaCheck(cudaIpcGetMemHandle(&handle, base_ptr)); _handle = PyBytes_FromStringAndSize((char *)&handle, CUDA_IPC_HANDLE_SIZE); _offset = PyLong_FromSsize_t((Py_ssize_t)offset); size = PyLong_FromSize_t(base_size / sizeof(real)); } if (!tuple || !device || !_handle || !size || !_offset || !view_size) { return NULL; } PyTuple_SET_ITEM(tuple.get(), 0, device.release()); PyTuple_SET_ITEM(tuple.get(), 1, _handle.release()); PyTuple_SET_ITEM(tuple.get(), 2, size.release()); PyTuple_SET_ITEM(tuple.get(), 3, _offset.release()); PyTuple_SET_ITEM(tuple.get(), 4, view_size.release()); return tuple.release(); END_HANDLE_TH_ERRORS }
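/* The tuple built above carries a cudaIpcMemHandle_t for the base allocation plus an
   offset, because the caching allocator may hand out pointers into a larger cached block.
   A rough sketch of the two sides of that exchange with the plain runtime API; the handle
   must be opened in a different process than the one that exported it, and the helper
   names here are invented for illustration. */
#include <cuda_runtime.h>
#include <stddef.h>

/* Exporting process: produce an IPC handle for the base device allocation. */
static cudaIpcMemHandle_t export_allocation(void *d_base) {
  cudaIpcMemHandle_t handle;
  cudaIpcGetMemHandle(&handle, d_base);
  return handle;                    /* ship these bytes to the peer process */
}

/* Importing process: map the remote allocation and re-apply the offset. */
static void *import_allocation(cudaIpcMemHandle_t handle, ptrdiff_t offset) {
  void *d_base = NULL;
  cudaIpcOpenMemHandle(&d_base, handle, cudaIpcMemLazyEnablePeerAccess);
  return (char *)d_base + offset;   /* release later via cudaIpcCloseMemHandle(d_base) */
}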
float THCudaStorage_get(const THCudaStorage *self, long index) { float value; THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(float), cudaMemcpyDeviceToHost)); return value; }
/* Based on the implementation of THTensor_(indexCopy) in torch7 */ static void THCudaTensor_indexCopy(THCudaTensor *tensor, int dim, THLongTensor *index, THCudaTensor *src) { long i, numel; THCudaTensor *tSlice, *sSlice; long *index_data; numel = THLongTensor_nElement(index); THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); THArgCheck(dim < src->nDimension, 4, "Indexing dim is out of bounds"); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); for (i=0; i<numel; i++) { if (tensor->nDimension > 1) { tSlice = THCudaTensor_new(); sSlice = THCudaTensor_new(); THCudaTensor_select(tSlice, tensor, dim, index_data[i]-1); THCudaTensor_select(sSlice, src, dim, i); THCudaTensor_copy(tSlice, sSlice); THCudaTensor_free(tSlice); THCudaTensor_free(sSlice); } else { /* It's faster to copy a float between two device addresses than to copy it to host memory and back to device memory */ THCudaCheck(cudaMemcpy(tensor->storage->data + tensor->storageOffset + index_data[i]-1, src->storage->data + src->storageOffset + i, sizeof(float), cudaMemcpyDeviceToDevice)); } } THLongTensor_free(index); }
static int cutorch_getDeviceProperties(lua_State *L) { struct cudaDeviceProp prop; int device = (int)luaL_checknumber(L, 1)-1; THCudaCheck(cudaGetDeviceProperties(&prop, device)); lua_newtable(L); SET_DEVN_PROP(canMapHostMemory); SET_DEVN_PROP(clockRate); SET_DEVN_PROP(computeMode); SET_DEVN_PROP(deviceOverlap); SET_DEVN_PROP(integrated); SET_DEVN_PROP(kernelExecTimeoutEnabled) SET_DEVN_PROP(major); SET_DEVN_PROP(maxThreadsPerBlock); SET_DEVN_PROP(memPitch); SET_DEVN_PROP(minor); SET_DEVN_PROP(multiProcessorCount); SET_DEVN_PROP(regsPerBlock); SET_DEVN_PROP(sharedMemPerBlock); SET_DEVN_PROP(textureAlignment); SET_DEVN_PROP(totalConstMem); SET_DEVN_PROP(totalGlobalMem); SET_DEVN_PROP(warpSize); lua_pushstring(L, prop.name); lua_setfield(L, -2, "name"); return 1; }
static int cutorch_getDeviceCount(lua_State *L) { int ndevice; THCudaCheck(cudaGetDeviceCount(&ndevice)); lua_pushnumber(L, ndevice); return 1; }
static int cutorch_setDevice(lua_State *L) { int device = (int)luaL_checknumber(L, 1)-1; THCudaCheck(cudaSetDevice(device)); THCRandom_manualSeed(THCRandom_initialSeed()); return 0; }
void* THCState_getCurrentDeviceScratchSpace(THCState* state) { int device = -1; THCudaCheck(cudaGetDevice(&device)); int stream = THCState_getCurrentStreamIndex(state); return THCState_getDeviceScratchSpace(state, device, stream); }
hostreal THCStorage_(get)(THCState *state, const THCStorage *self, long index) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); #ifndef THC_REAL_IS_HALF real value; THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real), cudaMemcpyDeviceToHost)); return realToHostreal(value); #else float *ret_d; float ret; THCudaCheck(THCudaMalloc(state, (void**)&ret_d, sizeof(float))); THCHalf2Float(state, ret_d, self->data + index, 1); THCudaCheck(cudaMemcpy(&ret, ret_d, sizeof(float), cudaMemcpyDeviceToHost)); THCudaFree(state, ret_d); return ret; #endif }
static int cutorch_getDevice(lua_State *L) { int device; THCudaCheck(cudaGetDevice(&device)); device++; lua_pushnumber(L, device); return 1; }
static int cutorch_Event_waitOn(lua_State *L) { cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event"); THCState *state = cutorch_getstate(L); THCudaCheck(cudaStreamWaitEvent(THCState_getCurrentStream(state), *event, 0)); return 0; }
static int cutorch_Event_free(lua_State *L) { cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event"); THCudaCheck(cudaEventDestroy(*event)); luaT_free(L, event); return 0; }
/* Synchronizes the host with respect to all devices */ static int cutorch_synchronizeAll(lua_State *L) { int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); int devices = -1; THCudaCheck(cudaGetDeviceCount(&devices)); for (int i = 0; i < devices; ++i) { THCudaCheck(cudaSetDevice(i)); THCudaCheck(cudaDeviceSynchronize()); } THCudaCheck(cudaSetDevice(prevDev)); return 0; }